-
Notifications
You must be signed in to change notification settings - Fork 32
Organization updates #66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: v0.1.7
Are you sure you want to change the base?
Changes from all commits
86b4dcf
35a4faf
5fce49e
0f3a617
a11a763
c526d12
d37b4c9
6c9e580
b983606
96a0382
65e51ba
07ffe6b
acc56ba
e83ceb2
9b5f66e
595136c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,9 +4,11 @@ | |
| from typing import Tuple | ||
| import pandas | ||
| import numpy as np | ||
| from io import StringIO | ||
|
|
||
| import merlin | ||
| from merlin.core import dataset | ||
| from merlin.util import dataportal | ||
|
|
||
|
|
||
| def _parse_list(inputString: str, dtype=float): | ||
|
|
@@ -31,40 +33,59 @@ class DataOrganization(object): | |
| image files. | ||
| """ | ||
|
|
||
| def __init__(self, dataSet, filePath: str = None): | ||
| def __init__(self, dataSet, filePath: str = None, | ||
| dataPortal: dataportal.DataPortal = None): | ||
| """ | ||
| Create a new DataOrganization for the data in the specified data set. | ||
|
|
||
| If filePath is not specified, a previously stored DataOrganization | ||
| is loaded from the dataSet if it exists. If filePath is specified, | ||
| the DataOrganization at the specified filePath is loaded and | ||
| stored in the dataSet, overwriting any previously stored | ||
| DataOrganization. | ||
| The DataOrganization is located in the following search order: | ||
| i) If filePath is specified and filePath exists this file is copied | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you change this to indicate to the user that it first looks for the filepath; if it doesn't find that, it falls back to looking for the filepath prepended with merlin.DATA_ORGANIZATION_HOME. It also seems like we should issue a warning or error to the user if they have something in filepath that doesn't exist in either location, in case it is able to get a dataorganization file from one of the other two if statements. |
||
| into the data set analysis directory and used as the dataorganization | ||
| ii) If dataPortal is specified and contains a file named | ||
| "dataorganization.csv", this file will be copied into the | ||
| data set analysis directory and used as the data organization. | ||
| iii) If neither filePath nor dataPortal is specified, the previously | ||
| stored dataorganization is used. | ||
|
|
||
| Raises: | ||
| InputDataError: If the set of raw data is incomplete or the | ||
| format of the raw data deviates from expectations. | ||
| """ | ||
|
|
||
| self._dataSet = dataSet | ||
| self.data = None | ||
|
|
||
| if filePath is not None: | ||
| if not os.path.exists(filePath): | ||
| filePath = os.sep.join( | ||
| [merlin.DATA_ORGANIZATION_HOME, filePath]) | ||
|
|
||
| self.data = pandas.read_csv( | ||
| filePath, | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
| self.data['readoutName'] = self.data['readoutName'].str.strip() | ||
| self._dataSet.save_dataframe_to_csv( | ||
| self.data, 'dataorganization', index=False) | ||
|
|
||
| else: | ||
| if self.data is None and dataPortal is not None: | ||
| try: | ||
| self.data = pandas.read_csv(StringIO(dataPortal.open_file( | ||
| 'dataorganization.csv').read_as_text()), | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
| # this could be many different exceptions so for now it can remain | ||
| # broad. If data can't be loaded from the data portal we load it | ||
| # from the dataset before | ||
| except Exception: | ||
| pass | ||
|
|
||
| if self.data is None: | ||
| self.data = self._dataSet.load_dataframe_from_csv( | ||
| 'dataorganization', | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
|
|
||
| self.data['readoutName'] = self.data['readoutName'].str.strip() | ||
| try: | ||
| self._dataSet.save_dataframe_to_csv( | ||
| self.data, 'dataorganization', index=False) | ||
| except PermissionError as e: | ||
| print('Unable to save data organization.') | ||
|
|
||
| stringColumns = ['readoutName', 'channelName', 'imageType', | ||
| 'imageRegExp', 'fiducialImageType', 'fiducialRegExp'] | ||
| self.data[stringColumns] = self.data[stringColumns].astype('str') | ||
|
|
@@ -256,8 +277,7 @@ def _get_image_path( | |
| (self.fileMap['fov'] == fov) & | ||
| (self.fileMap['imagingRound'] == imagingRound)] | ||
| filemapPath = selection['imagePath'].values[0] | ||
| return os.path.join(self._dataSet.dataHome, self._dataSet.dataSetName, | ||
| filemapPath) | ||
| return os.path.join(self._dataSet.imageDataPath, filemapPath) | ||
|
|
||
| def _truncate_file_path(self, path) -> None: | ||
| head, tail = os.path.split(path) | ||
|
|
@@ -283,7 +303,7 @@ def _map_image_files(self) -> None: | |
| fileNames = self._dataSet.get_image_file_names() | ||
| if len(fileNames) == 0: | ||
| raise dataset.DataFormatException( | ||
| 'No image files found at %s.' % self._dataSet.rawDataPath) | ||
| 'No image files found at %s.' % self._dataSet.imageDataPath) | ||
| fileData = [] | ||
| for currentType, currentIndex in zip(uniqueTypes, uniqueIndexes): | ||
| matchRE = re.compile( | ||
|
|
@@ -343,7 +363,7 @@ def _validate_file_map(self) -> None: | |
| (channelInfo['imageType'], fov, | ||
| channelInfo['imagingRound'])) | ||
|
|
||
| if not self._dataSet.rawDataPortal.open_file( | ||
| if not self._dataSet.imageDataPortal.open_file( | ||
| imagePath).exists(): | ||
| raise InputDataError( | ||
| ('Image data for channel {0} and fov {1} not found. ' | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,9 +56,10 @@ def build_parser(): | |
| help='the analysis home directory') | ||
| parser.add_argument('-k', '--snakemake-parameters', | ||
| help='the name of the snakemake parameters file') | ||
| parser.add_argument('--no_report', | ||
| help='flag indicating that the snakemake stats ' + | ||
| 'should not be shared to improve MERlin') | ||
| parser.add_argument('--report-path', | ||
| help='The path to send a report of the MERlin run to. ' + | ||
| 'If no report path is specified, no MERlin run ' + | ||
| 'information is shared.') | ||
|
|
||
| return parser | ||
|
|
||
|
|
@@ -144,7 +145,7 @@ def merlin(): | |
| snakemakeParameters = json.load(f) | ||
|
|
||
| run_with_snakemake(dataSet, snakefilePath, args.core_count, | ||
| snakemakeParameters, not args.no_report) | ||
| snakemakeParameters, args.report_path) | ||
|
|
||
|
|
||
| def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, | ||
|
|
@@ -160,18 +161,24 @@ def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, | |
|
|
||
| def run_with_snakemake( | ||
| dataSet: dataset.MERFISHDataSet, snakefilePath: str, coreCount: int, | ||
| snakemakeParameters: Dict = {}, report: bool = True): | ||
| snakemakeParameters: Dict = {}, reportPath: str = None): | ||
| print('Running MERlin pipeline through snakemake') | ||
|
|
||
| if 'restart_times' not in snakemakeParameters: | ||
| snakemakeParameters['restart_times'] = 3 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I worry that the typical user would not expect this behavior and might not understand why they have one job restarting itself. Could we log this somewhere the user might expect to find this type of information? |
||
| if 'latency_wait' not in snakemakeParameters: | ||
| snakemakeParameters['latency_wait'] = 60 | ||
|
|
||
| snakemake.snakemake(snakefilePath, cores=coreCount, | ||
| workdir=dataSet.get_snakemake_path(), | ||
| stats=snakefilePath + '.stats', lock=False, | ||
| **snakemakeParameters) | ||
|
|
||
| if report: | ||
| if reportPath: | ||
| reportTime = int(time.time()) | ||
| try: | ||
| with open(snakefilePath + '.stats', 'r') as f: | ||
| requests.post('http://merlin.georgeemanuel.com/post', | ||
| requests.post(reportPath, | ||
| files={ | ||
| 'file': ( | ||
| '.'.join( | ||
|
|
@@ -200,7 +207,7 @@ def run_with_snakemake( | |
| 'analysis_parameters': analysisParameters | ||
| } | ||
| try: | ||
| requests.post('http://merlin.georgeemanuel.com/post', | ||
| requests.post(reportPath, | ||
| files={'file': ('.'.join( | ||
| [dataSet.dataSetName, | ||
| str(reportTime)]) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should there be a way to add a dataportal when constructing an analysis folder with merlin.py?