diff --git a/.circleci/config.yml b/.circleci/config.yml index 366579ab..e7aaabae 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -22,6 +22,7 @@ jobs: conda config --set quiet true conda create -n merlin_env python=3.6 source activate merlin_env + conda install matplotlib conda install rtree conda install pytables cd ~ diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d3d05c3..c658e3f7 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,3 +45,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - An alternative Lucy-Richardson deconvolution approach that requires ~10x fewer iterations. +## [0.1.7] - +### Added +- Added option to put image data into a folder named "Data" and to save the data organization with the raw data in the root directory named dataorganization.csv diff --git a/merlin/analysis/generatemosaic.py b/merlin/analysis/generatemosaic.py index 430ea50b..ef17cef3 100755 --- a/merlin/analysis/generatemosaic.py +++ b/merlin/analysis/generatemosaic.py @@ -128,7 +128,8 @@ def _run_analysis(self): for z in zIndexes: with self.dataSet.writer_for_analysis_images( self, 'mosaic_%s_%i' - % (dataOrganization.get_data_channel_name(d), z))\ + % (dataOrganization.get_data_channel_name(d), z), + bigTiff = True, imagej = False)\ as outputTif: mosaic = self._prepare_mosaic_slice( z, d, micronExtents, alignTask, maximumProjection) diff --git a/merlin/analysis/optimize.py b/merlin/analysis/optimize.py index f182ab2b..875639fc 100755 --- a/merlin/analysis/optimize.py +++ b/merlin/analysis/optimize.py @@ -33,21 +33,16 @@ def __init__(self, dataSet, parameters=None, analysisName=None): if 'crop_width' not in self.parameters: self.parameters['crop_width'] = 0 - if 'fov_index' in self.parameters: + if 'fov_index' in self.parameters \ + and self.parameters['fov_index'] is not None: logger = self.dataSet.get_logger(self) logger.info('Setting fov_per_iteration to length of fov_index') self.parameters['fov_per_iteration'] = 
\ len(self.parameters['fov_index']) - else: - self.parameters['fov_index'] = [] - for i in range(self.parameters['fov_per_iteration']): - fovIndex = int(np.random.choice( - list(self.dataSet.get_fovs()))) - zIndex = int(np.random.choice( - list(range(len(self.dataSet.get_z_positions()))))) - self.parameters['fov_index'].append([fovIndex, zIndex]) + self.parameters['fov_index'] = None + def get_estimated_memory(self): return 4000 @@ -71,11 +66,21 @@ def get_codebook(self) -> Codebook: return preprocessTask.get_codebook() def _run_analysis(self, fragmentIndex): + logger = self.dataSet.get_logger(self) + preprocessTask = self.dataSet.load_analysis_task( self.parameters['preprocess_task']) codebook = self.get_codebook() - fovIndex, zIndex = self.parameters['fov_index'][fragmentIndex] + if self.parameters['fov_index'] is not None: + fovIndex, zIndex = self.parameters['fov_index'][fragmentIndex] + else: + fovIndex = int(np.random.choice( + list(self.dataSet.get_fovs()))) + zIndex = int(np.random.choice( + list(range(len(self.dataSet.get_z_positions()))))) + logger.info('Selected fov %i and z index %i for replicate %i' + % (fovIndex, zIndex, fragmentIndex)) scaleFactors = self._get_previous_scale_factors() backgrounds = self._get_previous_backgrounds() diff --git a/merlin/core/dataset.py b/merlin/core/dataset.py index bc120af6..dcbebd03 100755 --- a/merlin/core/dataset.py +++ b/merlin/core/dataset.py @@ -72,7 +72,10 @@ def __init__(self, dataDirectoryName: str, os.makedirs(self.analysisPath, exist_ok=True) self.logPath = os.sep.join([self.analysisPath, 'logs']) - os.makedirs(self.logPath, exist_ok=True) + try: + os.makedirs(self.logPath, exist_ok=True) + except PermissionError as e: + print("Unable to create logging directory") self._store_dataset_metadata() @@ -204,7 +207,7 @@ def get_analysis_image( def writer_for_analysis_images( self, analysisTask: TaskOrName, imageBaseName: str, - imageIndex: int = None, imagej: bool = True) -> tifffile.TiffWriter: + imageIndex: int 
= None, bigTiff = False, imagej: bool = True) -> tifffile.TiffWriter: """Get a writer for writing tiff files from an analysis task. Args: @@ -216,7 +219,8 @@ def writer_for_analysis_images( """ return tifffile.TiffWriter(self._analysis_image_name( - analysisTask, imageBaseName, imageIndex), imagej=imagej) + analysisTask, imageBaseName, imageIndex), bigtiff=bigTiff, + imagej=imagej) @staticmethod def analysis_tiff_description(sliceCount: int, frameCount: int) -> Dict: @@ -886,16 +890,30 @@ def __init__(self, dataDirectoryName: str, dataHome: str = None, if microscopeParametersName is not None: self._import_microscope_parameters(microscopeParametersName) - + + # try to find the image data in two locations. First in the Data + # subdirectory and then in the dataset directory + self.imageDataPath = os.sep.join([self.rawDataPath, 'Data']) + self.imageDataPortal = dataportal.DataPortal.create_portal( + self.imageDataPath) + if not self.imageDataPortal.is_available(): + # allow "data" to be used instead of "Data" + self.imageDataPath = os.sep.join([self.rawDataPath, 'data']) + self.imageDataPortal = dataportal.DataPortal.create_portal( + self.imageDataPath) + if not self.imageDataPortal.is_available(): + self.imageDataPath = self.rawDataPath + self.imageDataPortal = self.rawDataPortal + self._load_microscope_parameters() def get_image_file_names(self): - return sorted(self.rawDataPortal.list_files( + return sorted(self.imageDataPortal.list_files( extensionList=['.dax', '.tif', '.tiff'])) def load_image(self, imagePath, frameIndex): with imagereader.infer_reader( - self.rawDataPortal.open_file(imagePath)) as reader: + self.imageDataPortal.open_file(imagePath)) as reader: imageIn = reader.load_frame(int(frameIndex)) if self.transpose: imageIn = np.transpose(imageIn) @@ -913,7 +931,7 @@ def image_stack_size(self, imagePath): a three element list with [width, height, frameCount] or None if the file does not exist """ - with 
imagereader.infer_reader(self.rawDataPortal.open_file(imagePath) + with imagereader.infer_reader(self.imageDataPortal.open_file(imagePath) ) as reader: return reader.film_size() @@ -965,7 +983,7 @@ def get_image_xml_metadata(self, imagePath: str) -> Dict: imagePath: the path to the image file (.dax or .tif) Returns: the metadata from the associated xml file """ - filePortal = self.rawDataPortal.open_file( + filePortal = self.imageDataPortal.open_file( imagePath).get_sibling_with_extension('.xml') return xmltodict.parse(filePortal.read_as_text()) @@ -1005,7 +1023,7 @@ def __init__(self, dataDirectoryName: str, codebookNames: List[str] = None, microscopeParametersName) self.dataOrganization = dataorganization.DataOrganization( - self, dataOrganizationName) + self, dataOrganizationName, self.rawDataPortal) if codebookNames: self.codebooks = [codebook.Codebook(self, name, i) for i, name in enumerate(codebookNames)] diff --git a/merlin/data/dataorganization.py b/merlin/data/dataorganization.py index 1fa584d3..61d28c93 100755 --- a/merlin/data/dataorganization.py +++ b/merlin/data/dataorganization.py @@ -4,9 +4,11 @@ from typing import Tuple import pandas import numpy as np +from io import StringIO import merlin from merlin.core import dataset +from merlin.util import dataportal def _parse_list(inputString: str, dtype=float): @@ -31,15 +33,19 @@ class DataOrganization(object): image files. """ - def __init__(self, dataSet, filePath: str = None): + def __init__(self, dataSet, filePath: str = None, + dataPortal: dataportal.DataPortal = None): """ Create a new DataOrganization for the data in the specified data set. - If filePath is not specified, a previously stored DataOrganization - is loaded from the dataSet if it exists. If filePath is specified, - the DataOrganization at the specified filePath is loaded and - stored in the dataSet, overwriting any previously stored - DataOrganization. 
+ The DataOrganization is located in the following search order: + i) If filePath is specified and filePath exists this file is copied + into the data set analysis directory and used as the data organization. + ii) If dataPortal is specified and contains a file named + "dataorganization.csv", this file will be copied into the + data set analysis directory and used as the data organization. + iii) If neither filePath nor dataPortal is specified, the previously + stored data organization is used. Raises: InputDataError: If the set of raw data is incomplete or the @@ -47,24 +53,39 @@ def __init__(self, dataSet, filePath: str = None): """ self._dataSet = dataSet + self.data = None if filePath is not None: if not os.path.exists(filePath): filePath = os.sep.join( [merlin.DATA_ORGANIZATION_HOME, filePath]) - self.data = pandas.read_csv( filePath, converters={'frame': _parse_int_list, 'zPos': _parse_list}) - self.data['readoutName'] = self.data['readoutName'].str.strip() - self._dataSet.save_dataframe_to_csv( - self.data, 'dataorganization', index=False) - else: + if self.data is None and dataPortal is not None: + try: + self.data = pandas.read_csv(StringIO(dataPortal.open_file( + 'dataorganization.csv').read_as_text()), + converters={'frame': _parse_int_list, 'zPos': _parse_list}) + # this could be many different exceptions so for now it can remain + # broad.
If data can't be loaded from the data portal we load it + # from the dataset before + except Exception: + pass + + if self.data is None: self.data = self._dataSet.load_dataframe_from_csv( 'dataorganization', converters={'frame': _parse_int_list, 'zPos': _parse_list}) + self.data['readoutName'] = self.data['readoutName'].str.strip() + try: + self._dataSet.save_dataframe_to_csv( + self.data, 'dataorganization', index=False) + except PermissionError as e: + print('Unable to save data organization.') + stringColumns = ['readoutName', 'channelName', 'imageType', 'imageRegExp', 'fiducialImageType', 'fiducialRegExp'] self.data[stringColumns] = self.data[stringColumns].astype('str') @@ -256,8 +277,7 @@ def _get_image_path( (self.fileMap['fov'] == fov) & (self.fileMap['imagingRound'] == imagingRound)] filemapPath = selection['imagePath'].values[0] - return os.path.join(self._dataSet.dataHome, self._dataSet.dataSetName, - filemapPath) + return os.path.join(self._dataSet.imageDataPath, filemapPath) def _truncate_file_path(self, path) -> None: head, tail = os.path.split(path) @@ -283,7 +303,7 @@ def _map_image_files(self) -> None: fileNames = self._dataSet.get_image_file_names() if len(fileNames) == 0: raise dataset.DataFormatException( - 'No image files found at %s.' % self._dataSet.rawDataPath) + 'No image files found at %s.' % self._dataSet.imageDataPath) fileData = [] for currentType, currentIndex in zip(uniqueTypes, uniqueIndexes): matchRE = re.compile( @@ -343,7 +363,7 @@ def _validate_file_map(self) -> None: (channelInfo['imageType'], fov, channelInfo['imagingRound'])) - if not self._dataSet.rawDataPortal.open_file( + if not self._dataSet.imageDataPortal.open_file( imagePath).exists(): raise InputDataError( ('Image data for channel {0} and fov {1} not found. 
' diff --git a/merlin/merlin.py b/merlin/merlin.py index c892baa3..80b09ca0 100755 --- a/merlin/merlin.py +++ b/merlin/merlin.py @@ -56,9 +56,10 @@ def build_parser(): help='the analysis home directory') parser.add_argument('-k', '--snakemake-parameters', help='the name of the snakemake parameters file') - parser.add_argument('--no_report', - help='flag indicating that the snakemake stats ' + - 'should not be shared to improve MERlin') + parser.add_argument('--report-path', + help='The path to send a report of the MERlin run to.' + + 'If no report path is specified, no MERlin run ' + + 'information is shared.') return parser @@ -144,7 +145,7 @@ def merlin(): snakemakeParameters = json.load(f) run_with_snakemake(dataSet, snakefilePath, args.core_count, - snakemakeParameters, not args.no_report) + snakemakeParameters, args.report_path) def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, @@ -160,18 +161,24 @@ def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, def run_with_snakemake( dataSet: dataset.MERFISHDataSet, snakefilePath: str, coreCount: int, - snakemakeParameters: Dict = {}, report: bool = True): + snakemakeParameters: Dict = {}, reportPath: str = None): print('Running MERlin pipeline through snakemake') + + if 'restart_times' not in snakemakeParameters: + snakemakeParameters['restart_times'] = 3 + if 'latency_wait' not in snakemakeParameters: + snakemakeParameters['latency_wait'] = 60 + snakemake.snakemake(snakefilePath, cores=coreCount, workdir=dataSet.get_snakemake_path(), stats=snakefilePath + '.stats', lock=False, **snakemakeParameters) - if report: + if reportPath: reportTime = int(time.time()) try: with open(snakefilePath + '.stats', 'r') as f: - requests.post('http://merlin.georgeemanuel.com/post', + requests.post(reportPath, files={ 'file': ( '.'.join( @@ -200,7 +207,7 @@ def run_with_snakemake( 'analysis_parameters': analysisParameters } try: - requests.post('http://merlin.georgeemanuel.com/post', + 
requests.post(reportPath, files={'file': ('.'.join( [dataSet.dataSetName, str(reportTime)]) diff --git a/merlin/util/dataportal.py b/merlin/util/dataportal.py index 3c8d2bb6..d7461122 100755 --- a/merlin/util/dataportal.py +++ b/merlin/util/dataportal.py @@ -116,8 +116,8 @@ def __init__(self, basePath: str, **kwargs): self._s3 = boto3.resource('s3', **kwargs) def is_available(self): - objects = list(self._s3.Bucket(self._bucketName).objects.limit(10) - .filter(Prefix=self._prefix)) + objects = list(self._s3.Bucket(self._bucketName).objects + .filter(Prefix=self._prefix).limit(10)) return len(objects) > 0 def open_file(self, fileName): @@ -256,6 +256,9 @@ class LocalFilePortal(FilePortal): def __init__(self, fileName: str): super().__init__(fileName) + self._fileHandle = None + if not os.path.exists(fileName): + raise FileNotFoundError self._fileHandle = open(fileName, 'rb') def get_sibling_with_extension(self, newExtension: str): @@ -273,7 +276,8 @@ def read_file_bytes(self, startByte, endByte): return self._fileHandle.read(endByte-startByte) def close(self) -> None: - self._fileHandle.close() + if self._fileHandle is not None: + self._fileHandle.close() class S3FilePortal(FilePortal):