Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ jobs:
conda config --set quiet true
conda create -n merlin_env python=3.6
source activate merlin_env
conda install matplotlib
conda install rtree
conda install pytables
cd ~
Expand Down
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- An alternative Lucy-Richardson deconvolution approach that requires ~10x fewer iterations.

## [0.1.7] -
### Added
- Added option to place image data in a subdirectory named "Data" and to store the data organization with the raw data as a file named dataorganization.csv in the root directory
3 changes: 2 additions & 1 deletion merlin/analysis/generatemosaic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def _run_analysis(self):
for z in zIndexes:
with self.dataSet.writer_for_analysis_images(
self, 'mosaic_%s_%i'
% (dataOrganization.get_data_channel_name(d), z))\
% (dataOrganization.get_data_channel_name(d), z),
bigTiff = True, imagej = False)\
as outputTif:
mosaic = self._prepare_mosaic_slice(
z, d, micronExtents, alignTask, maximumProjection)
Expand Down
25 changes: 15 additions & 10 deletions merlin/analysis/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,16 @@ def __init__(self, dataSet, parameters=None, analysisName=None):
if 'crop_width' not in self.parameters:
self.parameters['crop_width'] = 0

if 'fov_index' in self.parameters:
if 'fov_index' in self.parameters \
and self.parameters['fov_index'] is not None:
logger = self.dataSet.get_logger(self)
logger.info('Setting fov_per_iteration to length of fov_index')

self.parameters['fov_per_iteration'] = \
len(self.parameters['fov_index'])

else:
self.parameters['fov_index'] = []
for i in range(self.parameters['fov_per_iteration']):
fovIndex = int(np.random.choice(
list(self.dataSet.get_fovs())))
zIndex = int(np.random.choice(
list(range(len(self.dataSet.get_z_positions())))))
self.parameters['fov_index'].append([fovIndex, zIndex])
self.parameters['fov_index'] = None


def get_estimated_memory(self):
return 4000
Expand All @@ -71,11 +66,21 @@ def get_codebook(self) -> Codebook:
return preprocessTask.get_codebook()

def _run_analysis(self, fragmentIndex):
logger = self.dataSet.get_logger(self)

preprocessTask = self.dataSet.load_analysis_task(
self.parameters['preprocess_task'])
codebook = self.get_codebook()

fovIndex, zIndex = self.parameters['fov_index'][fragmentIndex]
if self.parameters['fov_index'] is not None:
fovIndex, zIndex = self.parameters['fov_index'][fragmentIndex]
else:
fovIndex = int(np.random.choice(
list(self.dataSet.get_fovs())))
zIndex = int(np.random.choice(
list(range(len(self.dataSet.get_z_positions())))))
logger.info('Selected fov %i and z index %i for replicate %i'
% (fovIndex, zIndex, fragmentIndex))

scaleFactors = self._get_previous_scale_factors()
backgrounds = self._get_previous_backgrounds()
Expand Down
36 changes: 27 additions & 9 deletions merlin/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,10 @@ def __init__(self, dataDirectoryName: str,
os.makedirs(self.analysisPath, exist_ok=True)

self.logPath = os.sep.join([self.analysisPath, 'logs'])
os.makedirs(self.logPath, exist_ok=True)
try:
os.makedirs(self.logPath, exist_ok=True)
except PermissionError as e:
print("Unable to create logging directory")

self._store_dataset_metadata()

Expand Down Expand Up @@ -204,7 +207,7 @@ def get_analysis_image(

def writer_for_analysis_images(
self, analysisTask: TaskOrName, imageBaseName: str,
imageIndex: int = None, imagej: bool = True) -> tifffile.TiffWriter:
imageIndex: int = None, bigTiff = False, imagej: bool = True) -> tifffile.TiffWriter:
"""Get a writer for writing tiff files from an analysis task.

Args:
Expand All @@ -216,7 +219,8 @@ def writer_for_analysis_images(

"""
return tifffile.TiffWriter(self._analysis_image_name(
analysisTask, imageBaseName, imageIndex), imagej=imagej)
analysisTask, imageBaseName, imageIndex), bigtiff=bigTiff,
imagej=imagej)

@staticmethod
def analysis_tiff_description(sliceCount: int, frameCount: int) -> Dict:
Expand Down Expand Up @@ -886,16 +890,30 @@ def __init__(self, dataDirectoryName: str, dataHome: str = None,

if microscopeParametersName is not None:
self._import_microscope_parameters(microscopeParametersName)


# try to find the image data in two locations. First in the Data
# subdirectory and then in the dataset directory
self.imageDataPath = os.sep.join([self.rawDataPath, 'Data'])
self.imageDataPortal = dataportal.DataPortal.create_portal(
self.imageDataPath)
if not self.imageDataPortal.is_available():
# allow "data" to be used instead of "Data"
self.imageDataPath = os.sep.join([self.rawDataPath, 'data'])
self.imageDataPortal = dataportal.DataPortal.create_portal(
self.imageDataPath)
if not self.imageDataPortal.is_available():
self.imageDataPath = self.rawDataPath
self.imageDataPortal = self.rawDataPortal

self._load_microscope_parameters()

def get_image_file_names(self):
return sorted(self.rawDataPortal.list_files(
return sorted(self.imageDataPortal.list_files(
extensionList=['.dax', '.tif', '.tiff']))

def load_image(self, imagePath, frameIndex):
with imagereader.infer_reader(
self.rawDataPortal.open_file(imagePath)) as reader:
self.imageDataPortal.open_file(imagePath)) as reader:
imageIn = reader.load_frame(int(frameIndex))
if self.transpose:
imageIn = np.transpose(imageIn)
Expand All @@ -913,7 +931,7 @@ def image_stack_size(self, imagePath):
a three element list with [width, height, frameCount] or None
if the file does not exist
"""
with imagereader.infer_reader(self.rawDataPortal.open_file(imagePath)
with imagereader.infer_reader(self.imageDataPortal.open_file(imagePath)
) as reader:
return reader.film_size()

Expand Down Expand Up @@ -965,7 +983,7 @@ def get_image_xml_metadata(self, imagePath: str) -> Dict:
imagePath: the path to the image file (.dax or .tif)
Returns: the metadata from the associated xml file
"""
filePortal = self.rawDataPortal.open_file(
filePortal = self.imageDataPortal.open_file(
imagePath).get_sibling_with_extension('.xml')
return xmltodict.parse(filePortal.read_as_text())

Expand Down Expand Up @@ -1005,7 +1023,7 @@ def __init__(self, dataDirectoryName: str, codebookNames: List[str] = None,
microscopeParametersName)

self.dataOrganization = dataorganization.DataOrganization(
self, dataOrganizationName)
self, dataOrganizationName, self.rawDataPortal)
if codebookNames:
self.codebooks = [codebook.Codebook(self, name, i)
for i, name in enumerate(codebookNames)]
Expand Down
50 changes: 35 additions & 15 deletions merlin/data/dataorganization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@
from typing import Tuple
import pandas
import numpy as np
from io import StringIO

import merlin
from merlin.core import dataset
from merlin.util import dataportal


def _parse_list(inputString: str, dtype=float):
Expand All @@ -31,40 +33,59 @@ class DataOrganization(object):
image files.
"""

def __init__(self, dataSet, filePath: str = None):
def __init__(self, dataSet, filePath: str = None,
dataPortal: dataportal.DataPortal = None):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should there be a way to add a dataportal when constructing an analysis folder with merlin.py?

"""
Create a new DataOrganization for the data in the specified data set.

If filePath is not specified, a previously stored DataOrganization
is loaded from the dataSet if it exists. If filePath is specified,
the DataOrganization at the specified filePath is loaded and
stored in the dataSet, overwriting any previously stored
DataOrganization.
The DataOrganization is located in the following search order:
i) If filePath is specified and filePath exists this file is copied
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you change this to indicate to the user that it first looks for the filepath, if it doesn't find that it falls back to looking for the filepath prepended with merlin.DATA_ORGANIZATION_HOME. It also seems like we should issue a warning or error to the user if they have something in filepath that doesn't exist in either location, in case it is able to get a dataorganiation file from one of the other two if statements.

    into the data set analysis directory and used as the data organization
ii) If dataPortal is specified and contains a file named
"dataorganization.csv", this file will be copied into the
data set analysis directory and used as the data organization.
    iii) If neither filePath nor dataPortal is specified, the previously
stored dataorganization is used.

Raises:
InputDataError: If the set of raw data is incomplete or the
format of the raw data deviates from expectations.
"""

self._dataSet = dataSet
self.data = None

if filePath is not None:
if not os.path.exists(filePath):
filePath = os.sep.join(
[merlin.DATA_ORGANIZATION_HOME, filePath])

self.data = pandas.read_csv(
filePath,
converters={'frame': _parse_int_list, 'zPos': _parse_list})
self.data['readoutName'] = self.data['readoutName'].str.strip()
self._dataSet.save_dataframe_to_csv(
self.data, 'dataorganization', index=False)

else:
if self.data is None and dataPortal is not None:
try:
self.data = pandas.read_csv(StringIO(dataPortal.open_file(
'dataorganization.csv').read_as_text()),
converters={'frame': _parse_int_list, 'zPos': _parse_list})
# this could be many different exceptions so for now it can remain
# broad. If data can't be loaded from the data portal we load it
# from the dataset before
except Exception:
pass

if self.data is None:
self.data = self._dataSet.load_dataframe_from_csv(
'dataorganization',
converters={'frame': _parse_int_list, 'zPos': _parse_list})

self.data['readoutName'] = self.data['readoutName'].str.strip()
try:
self._dataSet.save_dataframe_to_csv(
self.data, 'dataorganization', index=False)
except PermissionError as e:
print('Unable to save data organization.')

stringColumns = ['readoutName', 'channelName', 'imageType',
'imageRegExp', 'fiducialImageType', 'fiducialRegExp']
self.data[stringColumns] = self.data[stringColumns].astype('str')
Expand Down Expand Up @@ -256,8 +277,7 @@ def _get_image_path(
(self.fileMap['fov'] == fov) &
(self.fileMap['imagingRound'] == imagingRound)]
filemapPath = selection['imagePath'].values[0]
return os.path.join(self._dataSet.dataHome, self._dataSet.dataSetName,
filemapPath)
return os.path.join(self._dataSet.imageDataPath, filemapPath)

def _truncate_file_path(self, path) -> None:
head, tail = os.path.split(path)
Expand All @@ -283,7 +303,7 @@ def _map_image_files(self) -> None:
fileNames = self._dataSet.get_image_file_names()
if len(fileNames) == 0:
raise dataset.DataFormatException(
'No image files found at %s.' % self._dataSet.rawDataPath)
'No image files found at %s.' % self._dataSet.imageDataPath)
fileData = []
for currentType, currentIndex in zip(uniqueTypes, uniqueIndexes):
matchRE = re.compile(
Expand Down Expand Up @@ -343,7 +363,7 @@ def _validate_file_map(self) -> None:
(channelInfo['imageType'], fov,
channelInfo['imagingRound']))

if not self._dataSet.rawDataPortal.open_file(
if not self._dataSet.imageDataPortal.open_file(
imagePath).exists():
raise InputDataError(
('Image data for channel {0} and fov {1} not found. '
Expand Down
23 changes: 15 additions & 8 deletions merlin/merlin.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,10 @@ def build_parser():
help='the analysis home directory')
parser.add_argument('-k', '--snakemake-parameters',
help='the name of the snakemake parameters file')
parser.add_argument('--no_report',
help='flag indicating that the snakemake stats ' +
'should not be shared to improve MERlin')
parser.add_argument('--report-path',
help='The path to send a report of the MERlin run to.' +
'If no report path is specified, no MERlin run ' +
'information is shared.')

return parser

Expand Down Expand Up @@ -144,7 +145,7 @@ def merlin():
snakemakeParameters = json.load(f)

run_with_snakemake(dataSet, snakefilePath, args.core_count,
snakemakeParameters, not args.no_report)
snakemakeParameters, args.report_path)


def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet,
Expand All @@ -160,18 +161,24 @@ def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet,

def run_with_snakemake(
dataSet: dataset.MERFISHDataSet, snakefilePath: str, coreCount: int,
snakemakeParameters: Dict = {}, report: bool = True):
snakemakeParameters: Dict = {}, reportPath: str = None):
print('Running MERlin pipeline through snakemake')

if 'restart_times' not in snakemakeParameters:
snakemakeParameters['restart_times'] = 3
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I worry that the typical user would not expect this behavior and might not understand why they have one job restarting itself. Could we log this somewhere the user might expect to find this type of information?

if 'latency_wait' not in snakemakeParameters:
snakemakeParameters['latency_wait'] = 60

snakemake.snakemake(snakefilePath, cores=coreCount,
workdir=dataSet.get_snakemake_path(),
stats=snakefilePath + '.stats', lock=False,
**snakemakeParameters)

if report:
if reportPath:
reportTime = int(time.time())
try:
with open(snakefilePath + '.stats', 'r') as f:
requests.post('http://merlin.georgeemanuel.com/post',
requests.post(reportPath,
files={
'file': (
'.'.join(
Expand Down Expand Up @@ -200,7 +207,7 @@ def run_with_snakemake(
'analysis_parameters': analysisParameters
}
try:
requests.post('http://merlin.georgeemanuel.com/post',
requests.post(reportPath,
files={'file': ('.'.join(
[dataSet.dataSetName,
str(reportTime)])
Expand Down
10 changes: 7 additions & 3 deletions merlin/util/dataportal.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def __init__(self, basePath: str, **kwargs):
self._s3 = boto3.resource('s3', **kwargs)

def is_available(self):
objects = list(self._s3.Bucket(self._bucketName).objects.limit(10)
.filter(Prefix=self._prefix))
objects = list(self._s3.Bucket(self._bucketName).objects
.filter(Prefix=self._prefix).limit(10))
return len(objects) > 0

def open_file(self, fileName):
Expand Down Expand Up @@ -256,6 +256,9 @@ class LocalFilePortal(FilePortal):

def __init__(self, fileName: str):
super().__init__(fileName)
self._fileHandle = None
if not os.path.exists(fileName):
raise FileNotFoundError
self._fileHandle = open(fileName, 'rb')

def get_sibling_with_extension(self, newExtension: str):
Expand All @@ -273,7 +276,8 @@ def read_file_bytes(self, startByte, endByte):
return self._fileHandle.read(endByte-startByte)

def close(self) -> None:
self._fileHandle.close()
if self._fileHandle is not None:
self._fileHandle.close()


class S3FilePortal(FilePortal):
Expand Down