-
Notifications
You must be signed in to change notification settings - Fork 32
Organization updates #66
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: v0.1.7
Are you sure you want to change the base?
Changes from all commits
86b4dcf
35a4faf
5fce49e
0f3a617
a11a763
c526d12
d37b4c9
6c9e580
b983606
96a0382
65e51ba
07ffe6b
acc56ba
e83ceb2
9b5f66e
595136c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,9 +4,11 @@ | |
| from typing import Tuple | ||
| import pandas | ||
| import numpy as np | ||
| from io import StringIO | ||
|
|
||
| import merlin | ||
| from merlin.core import dataset | ||
| from merlin.util import dataportal | ||
|
|
||
|
|
||
| def _parse_list(inputString: str, dtype=float): | ||
|
|
@@ -31,40 +33,59 @@ class DataOrganization(object): | |
| image files. | ||
| """ | ||
|
|
||
| def __init__(self, dataSet, filePath: str = None): | ||
| def __init__(self, dataSet, filePath: str = None, | ||
| dataPortal: dataportal.DataPortal = None): | ||
| """ | ||
| Create a new DataOrganization for the data in the specified data set. | ||
|
|
||
| If filePath is not specified, a previously stored DataOrganization | ||
| is loaded from the dataSet if it exists. If filePath is specified, | ||
| the DataOrganization at the specified filePath is loaded and | ||
| stored in the dataSet, overwriting any previously stored | ||
| DataOrganization. | ||
| The DataOrganization is located in the following search order: | ||
| i) If filePath is specified and filePath exists this file is copied | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you change this to indicate to the user that it first looks for the filepath; if it doesn't find that, it falls back to looking for the filepath prepended with merlin.DATA_ORGANIZATION_HOME. It also seems like we should issue a warning or error to the user if they have something in filepath that doesn't exist in either location, in case it is able to get a dataorganization file from one of the other two if statements. |
||
| into the data set analysis directory and used as the dataorganization | ||
| ii) If dataPortal is specified and contains a file named | ||
| "dataorganization.csv", this file will be copied into the | ||
| data set analysis directory and used as the data organization. | ||
| iii) If neither filePath nor dataPortal is specified, the previously | ||
| stored dataorganization is used. | ||
|
|
||
| Raises: | ||
| InputDataError: If the set of raw data is incomplete or the | ||
| format of the raw data deviates from expectations. | ||
| """ | ||
|
|
||
| self._dataSet = dataSet | ||
| self.data = None | ||
|
|
||
| if filePath is not None: | ||
| if not os.path.exists(filePath): | ||
| filePath = os.sep.join( | ||
| [merlin.DATA_ORGANIZATION_HOME, filePath]) | ||
|
|
||
| self.data = pandas.read_csv( | ||
| filePath, | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
| self.data['readoutName'] = self.data['readoutName'].str.strip() | ||
| self._dataSet.save_dataframe_to_csv( | ||
| self.data, 'dataorganization', index=False) | ||
|
|
||
| else: | ||
| if self.data is None and dataPortal is not None: | ||
| try: | ||
| self.data = pandas.read_csv(StringIO(dataPortal.open_file( | ||
| 'dataorganization.csv').read_as_text()), | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
| # this could be many different exceptions so for now it can remain | ||
| # broad. If data can't be loaded from the data portal we load it | ||
| # from the dataset before | ||
| except Exception: | ||
| pass | ||
|
|
||
| if self.data is None: | ||
| self.data = self._dataSet.load_dataframe_from_csv( | ||
| 'dataorganization', | ||
| converters={'frame': _parse_int_list, 'zPos': _parse_list}) | ||
|
|
||
| self.data['readoutName'] = self.data['readoutName'].str.strip() | ||
| try: | ||
| self._dataSet.save_dataframe_to_csv( | ||
| self.data, 'dataorganization', index=False) | ||
| except PermissionError as e: | ||
| print('Unable to save data organization.') | ||
|
|
||
| stringColumns = ['readoutName', 'channelName', 'imageType', | ||
| 'imageRegExp', 'fiducialImageType', 'fiducialRegExp'] | ||
| self.data[stringColumns] = self.data[stringColumns].astype('str') | ||
|
|
@@ -256,8 +277,7 @@ def _get_image_path( | |
| (self.fileMap['fov'] == fov) & | ||
| (self.fileMap['imagingRound'] == imagingRound)] | ||
| filemapPath = selection['imagePath'].values[0] | ||
| return os.path.join(self._dataSet.dataHome, self._dataSet.dataSetName, | ||
| filemapPath) | ||
| return os.path.join(self._dataSet.imageDataPath, filemapPath) | ||
|
|
||
| def _truncate_file_path(self, path) -> None: | ||
| head, tail = os.path.split(path) | ||
|
|
@@ -283,7 +303,7 @@ def _map_image_files(self) -> None: | |
| fileNames = self._dataSet.get_image_file_names() | ||
| if len(fileNames) == 0: | ||
| raise dataset.DataFormatException( | ||
| 'No image files found at %s.' % self._dataSet.rawDataPath) | ||
| 'No image files found at %s.' % self._dataSet.imageDataPath) | ||
| fileData = [] | ||
| for currentType, currentIndex in zip(uniqueTypes, uniqueIndexes): | ||
| matchRE = re.compile( | ||
|
|
@@ -343,7 +363,7 @@ def _validate_file_map(self) -> None: | |
| (channelInfo['imageType'], fov, | ||
| channelInfo['imagingRound'])) | ||
|
|
||
| if not self._dataSet.rawDataPortal.open_file( | ||
| if not self._dataSet.imageDataPortal.open_file( | ||
| imagePath).exists(): | ||
| raise InputDataError( | ||
| ('Image data for channel {0} and fov {1} not found. ' | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,9 +56,10 @@ def build_parser(): | |
| help='the analysis home directory') | ||
| parser.add_argument('-k', '--snakemake-parameters', | ||
| help='the name of the snakemake parameters file') | ||
| parser.add_argument('--no_report', | ||
| help='flag indicating that the snakemake stats ' + | ||
| 'should not be shared to improve MERlin') | ||
| parser.add_argument('--report-path', | ||
| help='The path to send a report of the MERlin run to. ' + | ||
| 'If no report path is specified, no MERlin run ' + | ||
| 'information is shared.') | ||
|
|
||
| return parser | ||
|
|
||
|
|
@@ -144,7 +145,7 @@ def merlin(): | |
| snakemakeParameters = json.load(f) | ||
|
|
||
| run_with_snakemake(dataSet, snakefilePath, args.core_count, | ||
| snakemakeParameters, not args.no_report) | ||
| snakemakeParameters, args.report_path) | ||
|
|
||
|
|
||
| def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, | ||
|
|
@@ -160,18 +161,24 @@ def generate_analysis_tasks_and_snakefile(dataSet: dataset.MERFISHDataSet, | |
|
|
||
| def run_with_snakemake( | ||
| dataSet: dataset.MERFISHDataSet, snakefilePath: str, coreCount: int, | ||
| snakemakeParameters: Dict = {}, report: bool = True): | ||
| snakemakeParameters: Dict = {}, reportPath: str = None): | ||
| print('Running MERlin pipeline through snakemake') | ||
|
|
||
| if 'restart_times' not in snakemakeParameters: | ||
| snakemakeParameters['restart_times'] = 3 | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I worry that the typical user would not expect this behavior and might not understand why they have one job restarting itself. Could we log this somewhere the user might expect to find this type of information? |
||
| if 'latency_wait' not in snakemakeParameters: | ||
| snakemakeParameters['latency_wait'] = 60 | ||
|
|
||
| snakemake.snakemake(snakefilePath, cores=coreCount, | ||
| workdir=dataSet.get_snakemake_path(), | ||
| stats=snakefilePath + '.stats', lock=False, | ||
| **snakemakeParameters) | ||
|
|
||
| if report: | ||
| if reportPath: | ||
| reportTime = int(time.time()) | ||
| try: | ||
| with open(snakefilePath + '.stats', 'r') as f: | ||
| requests.post('http://merlin.georgeemanuel.com/post', | ||
| requests.post(reportPath, | ||
| files={ | ||
| 'file': ( | ||
| '.'.join( | ||
|
|
@@ -200,7 +207,7 @@ def run_with_snakemake( | |
| 'analysis_parameters': analysisParameters | ||
| } | ||
| try: | ||
| requests.post('http://merlin.georgeemanuel.com/post', | ||
| requests.post(reportPath, | ||
| files={'file': ('.'.join( | ||
| [dataSet.dataSetName, | ||
| str(reportTime)]) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should there be a way to add a dataportal when constructing an analysis folder with merlin.py?