From 25408c63b76be3df2e806507a23f0d68db68b4ac Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Tue, 3 Dec 2019 10:56:00 -0500 Subject: [PATCH 01/66] Increment version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9b8e0f51..75e1d7f7 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="merlin", - version="0.1.2", + version="0.1.3", description="MERFISH decoding software", author="George Emanuel", author_email="emanuega0@gmail.com", From 450ab95a3e9d12cec4a8a926030df95da852a696 Mon Sep 17 00:00:00 2001 From: timblosser Date: Thu, 5 Dec 2019 15:36:18 -0500 Subject: [PATCH 02/66] Added ability to put text labels on the mosaic to indicate the fov. (#26) --- CHANGELOG.md | 4 ++++ docs/tasks.rst | 2 +- merlin/analysis/generatemosaic.py | 8 ++++++++ merlin/data/dataorganization.py | 4 ++-- 4 files changed, 15 insertions(+), 3 deletions(-) mode change 100644 => 100755 merlin/analysis/generatemosaic.py diff --git a/CHANGELOG.md b/CHANGELOG.md index cb161f71..abea84c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,3 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Exposed tolerance parameter in the adaptive filter barcodes method - Added plot for scale factor magnitude vs bit index - Fixed barcode partitioning to include cells from adjacent fields of view when a cell falls across fov boundaries + +## [0.1.3] - 2019-12-03 +### Added +- Added option to draw field of view labels overlaid on the mosaic diff --git a/docs/tasks.rst b/docs/tasks.rst index cc48416b..c4270d8f 100644 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -119,7 +119,7 @@ Parameters: * data\_channels -- The names of the data channels to export, corresponding to the data organization. If not provided, all data channels are exported. * z\_indexes -- The z index to export. If not provided all z indexes are exported. 
* fov\_crop\_width -- The number of pixels to remove from each edge of each fov before inserting it into the mosaic. - +* draw\_fov\_labels -- Flag indicating if the fov index should be drawn on top of each fov in the mosaic sequential.SumSignal ------------------------------- diff --git a/merlin/analysis/generatemosaic.py b/merlin/analysis/generatemosaic.py old mode 100644 new mode 100755 index b1cfdbf2..430ea50b --- a/merlin/analysis/generatemosaic.py +++ b/merlin/analysis/generatemosaic.py @@ -24,6 +24,8 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['fov_crop_width'] = 0 if 'separate_files' not in self.parameters: self.parameters['separate_files'] = False + if 'draw_fov_labels' not in self.parameters: + self.parameters['draw_fov_labels'] = False if self.parameters['microns_per_pixel'] == 'full_resolution': self.mosaicMicronsPerPixel = self.dataSet.get_microns_per_pixel() @@ -165,6 +167,12 @@ def _prepare_mosaic_slice(self, zIndex, dataChannel, micronExtents, inputImage[:, :cropWidth] = 0 inputImage[:, inputImage.shape[0] - cropWidth:] = 0 + if self.parameters['draw_fov_labels']: + inputImage = cv2.putText(inputImage, str(f), + (int(0.2*inputImage.shape[0]), + int(0.2*inputImage.shape[1])), + 0, 10, (65000, 65000, 65000), 20) + transformedImage = self._transform_image_to_mosaic( inputImage, f, alignTask, micronExtents, mosaicDimensions) diff --git a/merlin/data/dataorganization.py b/merlin/data/dataorganization.py index f8033e62..28bab4f1 100755 --- a/merlin/data/dataorganization.py +++ b/merlin/data/dataorganization.py @@ -113,8 +113,8 @@ def get_data_channel_index(self, dataChannelName: str) -> int: # is not found """ return self.data[self.data['channelName'].apply( - lambda x: str(x).lower()).str.match( - dataChannelName.lower())].index.values.tolist()[0] + lambda x: str(x).lower()) == str(dataChannelName).lower()]\ + .index.values.tolist()[0] def get_data_channel_color(self, dataChannel: int) -> str: """Get the color used 
for measuring the specified data channel. From 5ba40d8903cdeeba8a160eba2e5afeb199ae5ed7 Mon Sep 17 00:00:00 2001 From: seichhorn Date: Thu, 5 Dec 2019 15:29:33 -0600 Subject: [PATCH 03/66] Assign cell fix (#32) --- CHANGELOG.md | 5 +- docs/tasks.rst | 9 +- merlin/analysis/partition.py | 7 +- merlin/analysis/segment.py | 212 ++++++---------- merlin/util/spatialfeature.py | 234 ++++++++++++++++++ .../test_analysis_parameters.json | 17 +- test/test_spatialfeature.py | 57 +++++ 7 files changed, 399 insertions(+), 142 deletions(-) mode change 100644 => 100755 test/auxiliary_files/test_analysis_parameters.json mode change 100644 => 100755 test/test_spatialfeature.py diff --git a/CHANGELOG.md b/CHANGELOG.md index abea84c5..ac65e86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added plot for scale factor magnitude vs bit index - Fixed barcode partitioning to include cells from adjacent fields of view when a cell falls across fov boundaries -## [0.1.3] - 2019-12-03 +## [0.1.3] - 2019-12-04 +### Fixed +- Addressed bugs present in cleaning overlapping cells and assigning them to a fov ### Added - Added option to draw field of view labels overlaid on the mosaic + diff --git a/docs/tasks.rst b/docs/tasks.rst index c4270d8f..ffc1c7d5 100644 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -98,10 +98,15 @@ Parameters: * seed\_channel\_name -- The name of the data channel to use to find seeds * watershed\_channel\_name -- The name of the data channel to use as the watershed image.W -segment.AssignCellFOV +segment.CleanCellBoundaries -------------------------------- -Description: Assigns each cell to the FOV centroid they are closest to, and eliminates overlapping cells from the dataset, keeping 1. 
+Description: Assigns each cell to the FOV centroid they are closest to, and eliminates overlapping cells, preferentially removing cells that overlap with the largest number of other cells until there is no more overlap in a given group of cells. + +segment.RefineCellDatabases +-------------------------------- + +Description: Creates a new cell database based on an initial cell database and a set of cells to keep. segment.ExportCellMetadata -------------------------------- diff --git a/merlin/analysis/partition.py b/merlin/analysis/partition.py index 7e183656..cdbd1003 100755 --- a/merlin/analysis/partition.py +++ b/merlin/analysis/partition.py @@ -25,7 +25,8 @@ def get_estimated_time(self): def get_dependencies(self): return [self.parameters['filter_task'], - self.parameters['assignment_task']] + self.parameters['assignment_task'], + self.parameters['cleaning_task']] def get_partitioned_barcodes(self, fov: int = None) -> pandas.DataFrame: """Retrieve the cell by barcode matrixes calculated from this @@ -52,8 +53,10 @@ def _run_analysis(self, fragmentIndex): self.parameters['filter_task']) assignmentTask = self.dataSet.load_analysis_task( self.parameters['assignment_task']) + cleaningTask = self.dataSet.load_analysis_task( + self.parameters['cleaning_task']) - tiledPos, fovBoxes = assignmentTask._get_fov_boxes() + fovBoxes = cleaningTask.get_fov_boxes() fovIntersections = sorted([i for i, x in enumerate(fovBoxes) if fovBoxes[fragmentIndex].intersects(x)]) diff --git a/merlin/analysis/segment.py b/merlin/analysis/segment.py index fa6f28c2..72213358 100755 --- a/merlin/analysis/segment.py +++ b/merlin/analysis/segment.py @@ -12,6 +12,7 @@ from merlin.util import spatialfeature from merlin.util import watershed import pandas +import networkx as nx class FeatureSavingAnalysisTask(analysistask.ParallelAnalysisTask): @@ -119,8 +120,14 @@ def _read_and_filter_image_stack(self, fov: int, channelIndex: int, for z in range(len(self.dataSet.get_z_positions()))]) -class 
AssignCellFOV(FeatureSavingAnalysisTask): - +class CleanCellBoundaries(analysistask.AnalysisTask): + ''' + A task to construct a network graph where each cell is a node, and overlaps + are represented by edges. This graph is then refined to assign cells to the + fov they are closest to (in terms of centroid). This graph is then refined + to eliminate overlapping cells to leave a single cell occupying a given + position. + ''' def __init__(self, dataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) @@ -129,13 +136,6 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.alignTask = self.dataSet.load_analysis_task( self.parameters['global_align_task']) - def fragment_count(self): - return 1 - - def _reset_analysis(self, fragmentIndex: int = None) -> None: - super()._reset_analysis(fragmentIndex) - self.get_feature_database().empty_database() - def get_estimated_memory(self): return 2048 @@ -146,19 +146,16 @@ def get_dependencies(self): return [self.parameters['segment_task'], self.parameters['global_align_task']] - def _get_fov_boxes(self): + def get_fov_boxes(self): allFOVs = self.dataSet.get_fovs() coords = [self.alignTask.fov_global_extent(f) for f in allFOVs] coordsDF = pandas.DataFrame(coords, columns=['minx', 'miny', 'maxx', 'maxy'], index=allFOVs) - coordsDF['centerX'] = (coordsDF['minx'] + coordsDF['maxx']) / 2 - coordsDF['centerY'] = (coordsDF['miny'] + coordsDF['maxy']) / 2 - boxes = [geometry.box(x[0], x[1], x[2], x[3]) for x in coordsDF.loc[:, ['minx', 'miny', 'maxx', 'maxy']].values] - return coordsDF, boxes + return boxes def _construct_fov_tree(self, tiledPositions: pandas.DataFrame, fovIntersections: List): @@ -168,138 +165,87 @@ def _construct_fov_tree(self, tiledPositions: pandas.DataFrame, def _intial_clean(self, currentFOV: int): currentCells = self.segmentTask.get_feature_database()\ .read_features(currentFOV) - return [cell for cell in currentCells + return [cell for cell in 
currentCells if len(cell.get_bounding_box()) == 4 and cell.get_volume() > 0] - def _secondary_assignments(self, currentFOV: int, - secondaryAssignmentDict: Dict): - currentCells = self._intial_clean(currentFOV) - - secondaryAssignments = [ - [x, secondaryAssignmentDict[currentFOV][x.get_feature_id()]] - for x in currentCells - if x.get_feature_id() - in secondaryAssignmentDict[currentFOV]] - return secondaryAssignments - def _append_cells_to_spatial_tree(self, tree: rtree.index.Index, cells: List, idToNum: Dict): for element in cells: tree.insert(idToNum[element.get_feature_id()], - element.get_bounding_box(), obj=element.get_fov()) + element.get_bounding_box(), obj=element) - def _run_analysis(self, fragmentIndex): + def return_exported_data(self): + kwargs = {'index_col': 0} + return self.dataSet.load_dataframe_from_csv( + 'cleanedcells', analysisTask=self.analysisName, **kwargs) - featureDB = self.get_feature_database() + def _run_analysis(self) -> None: - spatialIndex = rtree.index.Index() - allFOVs = self.dataSet.get_fovs() - tiledPositions, allFOVBoxes = self._get_fov_boxes() - numToID = dict() + spatialTree = rtree.index.Index() + count = 0 idToNum = dict() - currentID = 0 - for currentFOV in allFOVs: - currentUnassigned = self._intial_clean(currentFOV) - for i in range(len(currentUnassigned)): - numToID[currentID] = currentUnassigned[i].get_feature_id() - idToNum[currentUnassigned[i].get_feature_id()] = currentID - currentID += 1 - self._append_cells_to_spatial_tree( - spatialIndex, currentUnassigned, idToNum) - - newFOVAssignments = {f: dict() for f in allFOVs} + allFOVs = self.dataSet.get_fovs() for currentFOV in allFOVs: - fovIntersections = sorted([i for i, x in enumerate(allFOVBoxes) if - allFOVBoxes[currentFOV].intersects(x)]) - fovTree = self._construct_fov_tree(tiledPositions, fovIntersections) - currentCells = self._intial_clean(currentFOV) - for cell in currentCells: - overlappingCells = spatialIndex.intersection( - cell.get_bounding_box(), 
objects=True) - toCheck = [] - for c in overlappingCells: - xmin, ymin, xmax, ymax = c.bbox - toCheck.append([numToID[c.id], - c.object, - xmin, ymin, xmax, ymax]) - if len(toCheck) == 0: - raise Exception(('Missing {} from spatial tree. Spatial ' + - 'tree must be malformed.').format( - cell.get_feature_id())) - else: - # If a cell does not overlap another cell, - # keep it and assign it to an fov based on the fov centroid - # it's closest to - if len(toCheck) == 1 and \ - cell.get_feature_id() == toCheck[0][0]: - xCenter = (toCheck[0][2] + toCheck[0][4]) / 2 - yCenter = (toCheck[0][3] + toCheck[0][5]) / 2 - [d, i] = fovTree.query(np.array([xCenter, yCenter])) - assignedFOV = tiledPositions \ - .loc[fovIntersections, :] \ - .index.values.tolist()[i] - newFOVAssignments[currentFOV][toCheck[0][0]] = \ - assignedFOV - # I dont know if this case will come up but want to guard - # against this, it implies there was an error - # in the original tree construction - elif len(toCheck) == 1 and not \ - cell.get_feature_id() == toCheck[0][0]: - print('encountered unexpected overlap between {} and {}' - .format(cell.get_feature_id(), toCheck[0][0])) - # If a cell overlaps at least one other cell first check if - # all cells are closest to a particular FOV centroid, - # then if a cell boundary was determined from that fov - # keep it, else choose a cell at random. 
If overlapping - # cells are closest to different fov centroids, - # again keep one at random - elif len(toCheck) >= 2: - toCheckDF = pandas.DataFrame(toCheck, - columns=['cell ID', - 'initial FOV', - 'xmin', 'ymin', - 'xmax', 'ymax']) - toCheckDF['centerX'] = (toCheckDF.loc[:, 'xmin'] + - toCheckDF.loc[:, 'xmax']) / 2 - toCheckDF['centerY'] = (toCheckDF.loc[:, 'ymin'] + - toCheckDF.loc[:, 'ymax']) / 2 - [d, i] = fovTree.query(toCheckDF.loc[:, ['centerX', - 'centerY']], - k=1) - assignedFOV = np.array(tiledPositions - .loc[fovIntersections, :] - .index.values.tolist())[i] - toCheckDF['assigned FOV'] = assignedFOV - if len(np.unique(assignedFOV)) == 1: - if len(toCheckDF[toCheckDF['initial FOV'] == - np.unique(assignedFOV)[0]]) > 0: - selected = toCheckDF[ - toCheckDF['initial FOV'] == np.unique( - assignedFOV)[0]].sample(n=1) - else: - selected = toCheckDF.sample(n=1) - else: - matching = toCheckDF[toCheckDF['initial FOV'] == - toCheckDF['assigned FOV']] - if len(matching) == 0: - selected = toCheckDF.sample(n=1) - else: - selected = matching.sample(n=1) - selectedFOV = selected.loc[:, 'assigned FOV'] \ - .values.tolist()[0] - selectedCellID = selected.loc[:, 'cell ID'] \ - .values.tolist()[0] - newFOVAssignments[ - currentFOV][selectedCellID] = selectedFOV + cells = self.segmentTask.get_feature_database()\ + .read_features(currentFOV) + cells = spatialfeature.simple_clean_cells(cells) + spatialTree, count, idToNum = spatialfeature.construct_tree( + cells, spatialTree, count, idToNum) + + fovBoxes = self.get_fov_boxes() + graph = nx.Graph() for currentFOV in allFOVs: - secondaryCellAssignments = \ - self._secondary_assignments(currentFOV, newFOVAssignments) - allFOV = list(set([x[1] for x in secondaryCellAssignments])) - for f in allFOV: - cellsInFOV = [x[0] for x in secondaryCellAssignments - if x[1] == f] - featureDB.write_features(cellsInFOV, f) + cells = self.segmentTask.get_feature_database()\ + .read_features(currentFOV) + cells = 
spatialfeature.simple_clean_cells(cells) + graph = spatialfeature.construct_graph(graph, cells, + spatialTree, currentFOV, + allFOVs, fovBoxes) + + cleanedCells = spatialfeature.remove_overlapping_cells(graph) + + self.dataSet.save_dataframe_to_csv(cleanedCells, 'cleanedcells', + analysisTask=self) + + +class RefineCellDatabases(FeatureSavingAnalysisTask): + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) + + self.segmentTask = self.dataSet.load_analysis_task( + self.parameters['segment_task']) + self.cleaningTask = self.dataSet.load_analysis_task( + self.parameters['cleaning_task']) + + def fragment_count(self): + return len(self.dataSet.get_fovs()) + + def get_estimated_memory(self): + # TODO - refine estimate + return 2048 + + def get_estimated_time(self): + # TODO - refine estimate + return 5 + + def get_dependencies(self): + return [self.parameters['segment_task'], + self.parameters['cleaning_task']] + + def _run_analysis(self, fragmentIndex): + + cleanedCells = self.cleaningTask.return_exported_data() + originalCells = self.segmentTask.get_feature_database()\ + .read_features(fragmentIndex) + featureDB = self.get_feature_database() + cleanedC = cleanedCells[cleanedCells['originalFOV'] == fragmentIndex] + cleanedGroups = cleanedC.groupby('assignedFOV') + for k, g in cleanedGroups: + cellsToConsider = g['cell_id'].values.tolist() + featureList = [x for x in originalCells if + str(x.get_feature_id()) in cellsToConsider] + featureDB.write_features(featureList, fragmentIndex) class ExportCellMetadata(analysistask.AnalysisTask): diff --git a/merlin/util/spatialfeature.py b/merlin/util/spatialfeature.py index 6952d699..10de7f0a 100755 --- a/merlin/util/spatialfeature.py +++ b/merlin/util/spatialfeature.py @@ -10,6 +10,10 @@ import h5py import merlin import pandas +import networkx as nx +import rtree +from scipy.spatial import cKDTree + from merlin.core import dataset @@ -299,6 +303,35 @@ def 
contains_positions(self, positionList: np.ndarray) -> np.ndarray: return containmentList + def get_overlapping_features(self, featuresToCheck: List['SpatialFeature'] + ) -> List['SpatialFeature']: + """ Determine which features within the provided list overlap with this + feature. + + Args: + featuresToCheck: the list of features to check for overlap with + this feature. + Returns: the features that overlap with this feature + """ + areas = [self.intersection(x) for x in featuresToCheck] + overlapping = [featuresToCheck[i] for i, x in enumerate(areas) if x > 0] + benchmark = self.intersection(self) + contained = [x for x in overlapping if + x.intersection(self) == benchmark] + if len(contained) > 1: + overlapping = [] + else: + toReturn = [] + for c in overlapping: + if c.get_feature_id() == self.get_feature_id(): + toReturn.append(c) + else: + if c.intersection(self) != c.intersection(c): + toReturn.append(c) + overlapping = toReturn + + return overlapping + def to_json_dict(self) -> Dict: return { 'fov': self._fov, @@ -585,3 +618,204 @@ def _extract_feature_metadata(feature: SpatialFeature) -> Dict: 'bounds_x2': boundingBox[2], 'bounds_y2': boundingBox[3], 'volume': feature.get_volume()} + + +def simple_clean_cells(cells: List) -> List: + """ + Removes cells that lack a bounding box or have a volume equal to 0 + + Args: + cells: List of spatial features + + Returns: + List of spatial features + + """ + return [cell for cell in cells + if len(cell.get_bounding_box()) == 4 and cell.get_volume() > 0] + + +def append_cells_to_spatial_tree(tree: rtree.index.Index, + cells: List, idToNum: Dict): + for element in cells: + tree.insert(idToNum[element.get_feature_id()], + element.get_bounding_box(), obj=element) + + +def construct_tree(cells: List, + spatialIndex: rtree.index.Index = rtree.index.Index(), + count: int = 0, idToNum: Dict = dict()): + """ + Builds or adds to an rtree with a list of cells + + Args: + cells: list of spatial features + spatialIndex: an existing 
rtree to append to + count: number of existing entries in existing rtree + idToNum: dict containing feature ID as key, and number in rtree as value + + Returns: + spatialIndex: an rtree updated with the input cells + count: number of entries in rtree + idToNum: dict containing feature ID as key, and number in rtree as value + """ + + for i in range(len(cells)): + idToNum[cells[i].get_feature_id()] = count + count += 1 + append_cells_to_spatial_tree(spatialIndex, cells, idToNum) + + return spatialIndex, count, idToNum + + +def return_overlapping_cells(currentCell, cells: List): + """ + Determines if there is overlap between a cell of interest and a list of + other cells. In the event that the cell of interest is entirely contained + within one of the cells in the cells it is being compared to, an empty + list is returned. Otherwise, the cell of interest and any overlapping + cells are returned. + Args: + currentCell: A spatial feature of interest + cells: A list of spatial features to compare to, the spatial feature + of interest is expected to be in this list + + Returns: + A list of spatial features including the cell of interest and all + overlapping cells, or an empty list if the cell of intereset is + entirely contained within one of the cells it is compared to + """ + areas = [currentCell.intersection(x) for x in cells] + overlapping = [cells[i] for i, x in enumerate(areas) if x > 0] + benchmark = currentCell.intersection(currentCell) + contained = [x for x in overlapping if + x.intersection(currentCell) == benchmark] + if len(contained) > 1: + overlapping = [] + else: + toReturn = [] + for c in overlapping: + if c.get_feature_id() == currentCell.get_feature_id(): + toReturn.append(c) + else: + if c.intersection(currentCell) != c.intersection(c): + toReturn.append(c) + overlapping = toReturn + + return overlapping + + +def construct_graph(graph, cells, spatialTree, currentFOV, allFOVs, fovBoxes): + """ + Adds the cells from the current fov to a graph where 
each node is a cell + and edges connect overlapping cells. + + Args: + graph: An undirected graph, either empty of already containing cells + cells: A list of spatial features to potentially add to graph + spatialTree: an rtree index containing each cell in the dataset + currentFOV: the fov currently being added to the graph + allFOVs: a list of all fovs in the dataset + fovBoxes: a list of shapely polygons containing the bounds of each fov + + Returns: + A graph updated to include cells from the current fov + """ + + fovIntersections = sorted([i for i, x in enumerate(fovBoxes) if + fovBoxes[currentFOV].intersects(x)]) + + coords = [x.centroid.coords.xy for x in fovBoxes] + xcoords = [x[0][0] for x in coords] + ycoords = [x[1][0] for x in coords] + coordsDF = pandas.DataFrame(data=np.array(list(zip(xcoords, ycoords))), + index=allFOVs, + columns=['centerX', 'centerY']) + fovTree = cKDTree(data=coordsDF.loc[fovIntersections, + ['centerX', 'centerY']].values) + for cell in cells: + overlappingCells = spatialTree.intersection( + cell.get_bounding_box(), objects=True) + toCheck = [x.object for x in overlappingCells] + cellsToConsider = return_overlapping_cells( + cell, toCheck) + if len(cellsToConsider) == 0: + pass + else: + for cellToConsider in cellsToConsider: + xmin, ymin, xmax, ymax =\ + cellToConsider.get_bounding_box() + xCenter = (xmin + xmax) / 2 + yCenter = (ymin + ymax) / 2 + [d, i] = fovTree.query(np.array([xCenter, yCenter])) + assignedFOV = coordsDF.loc[fovIntersections, :]\ + .index.values.tolist()[i] + if cellToConsider.get_feature_id() not in graph.nodes: + graph.add_node(cellToConsider.get_feature_id(), + originalFOV=cellToConsider.get_fov(), + assignedFOV=assignedFOV) + if len(cellsToConsider) > 1: + for cellToConsider1 in cellsToConsider: + if cellToConsider1.get_feature_id() !=\ + cell.get_feature_id(): + graph.add_edge(cell.get_feature_id(), + cellToConsider1.get_feature_id()) + return graph + + +def remove_overlapping_cells(graph): + """ + Takes 
in a graph in which each node is a cell and edges connect cells that + overlap eachother in space. Removes overlapping cells, preferentially + eliminating the cell that overlaps the most cells (i.e. if cell A overlaps + cells B, C, and D, whereas cell B only overlaps cell A, cell C only overlaps + cell A, and cell D only overlaps cell A, then cell A will be removed, + leaving cells B, C, and D remaining because there is no more overlap + within this group of cells). + Args: + graph: An undirected graph, in which each node is a cell and each + edge connects overlapping cells. nodes are expected to have + the following attributes: originalFOV, assignedFOV + Returns: + A pandas dataframe containing the feature ID of all cells after removing + all instances of overlap. There are columns for cell_id, originalFOV, + and assignedFOV + """ + connectedComponents = list(nx.connected_components(graph)) + cleanedCells = [] + connectedComponents = [list(x) for x in connectedComponents] + for component in connectedComponents: + if len(component) == 1: + originalFOV = graph.nodes[component[0]]['originalFOV'] + assignedFOV = graph.nodes[component[0]]['assignedFOV'] + cleanedCells.append([component[0], originalFOV, assignedFOV]) + if len(component) > 1: + sg = nx.subgraph(graph, component) + verts = list(nx.articulation_points(sg)) + if len(verts) > 0: + sg = nx.subgraph(graph, + [x for x in component if x not in verts]) + allEdges = [[k, v] for k, v in nx.degree(sg)] + sortedEdges = sorted(allEdges, key=lambda x: x[1], reverse=True) + maxEdges = sortedEdges[0][1] + while maxEdges > 0: + sg = nx.subgraph(graph, [x[0] for x in sortedEdges[1:]]) + allEdges = [[k, v] for k, v in nx.degree(sg)] + sortedEdges = sorted(allEdges, key=lambda x: x[1], + reverse=True) + maxEdges = sortedEdges[0][1] + keptComponents = list(sg.nodes()) + cellIDs = [] + originalFOVs = [] + assignedFOVs = [] + for c in keptComponents: + cellIDs.append(c) + originalFOVs.append(graph.nodes[c]['originalFOV']) + 
assignedFOVs.append(graph.nodes[c]['assignedFOV']) + listOfLists = list(zip(cellIDs, originalFOVs, assignedFOVs)) + listOfLists = [list(x) for x in listOfLists] + cleanedCells = cleanedCells + listOfLists + cleanedCellsDF = pandas.DataFrame(cleanedCells, + columns=['cell_id', 'originalFOV', + 'assignedFOV']) + return cleanedCellsDF diff --git a/test/auxiliary_files/test_analysis_parameters.json b/test/auxiliary_files/test_analysis_parameters.json old mode 100644 new mode 100755 index 881663fa..00e0baa5 --- a/test/auxiliary_files/test_analysis_parameters.json +++ b/test/auxiliary_files/test_analysis_parameters.json @@ -113,19 +113,28 @@ } }, { - "task": "AssignCellFOV", + "task": "CleanCellBoundaries", "module": "merlin.analysis.segment", "parameters": { "segment_task": "WatershedSegment", "global_align_task": "SimpleGlobalAlignment" } }, + { + "task": "RefineCellDatabases", + "module": "merlin.analysis.segment", + "parameters": { + "segment_task": "WatershedSegment", + "cleaning_task": "CleanCellBoundaries" + } + }, { "task": "PartitionBarcodes", "module": "merlin.analysis.partition", "parameters": { "filter_task": "AdaptiveFilterBarcodes", - "assignment_task": "AssignCellFOV" + "assignment_task": "RefineCellDatabases", + "cleaning_task": "CleanCellBoundaries" } }, { @@ -139,7 +148,7 @@ "task": "ExportCellMetadata", "module": "merlin.analysis.segment", "parameters": { - "segment_task": "AssignCellFOV" + "segment_task": "RefineCellDatabases" } }, { @@ -150,7 +159,7 @@ "apply_highpass": true, "warp_task": "FiducialCorrelationWarp", "highpass_sigma": 5, - "segment_task": "AssignCellFOV", + "segment_task": "RefineCellDatabases", "global_align_task": "SimpleGlobalAlignment" } }, diff --git a/test/test_spatialfeature.py b/test/test_spatialfeature.py old mode 100644 new mode 100755 index 19af6f6c..c0bd3d20 --- a/test/test_spatialfeature.py +++ b/test/test_spatialfeature.py @@ -1,6 +1,7 @@ import pytest import numpy as np import json +import networkx as nx from shapely 
import geometry from merlin.util import spatialfeature @@ -18,6 +19,18 @@ [geometry.Polygon(testCoords2)]], 0, zCoordinates=np.array([0, 0.5])) +p1 = spatialfeature.SpatialFeature( + [[geometry.Polygon([(0, 0), (0, 1), (1, 1), (1, 0)])]], 0) +p2 = spatialfeature.SpatialFeature( + [[geometry.Polygon([(0, 0.5), (0, 1), (1, 1), (1, 0.5)])]], 0) +p3 = spatialfeature.SpatialFeature( + [[geometry.Polygon([(0, 0.5), (0, 1.5), (1, 1.5), (1, 0.5)])]], 0) +p4 = spatialfeature.SpatialFeature( + [[geometry.Polygon([(0, 1), (0, 2), (1, 2), (1, 1)])]], 0) +p5 = spatialfeature.SpatialFeature( + [[geometry.Polygon([(4, 4), (4, 5), (5, 5), (5, 4)])]], 0) +allCells = [p1, p2, p3, p4, p5] + def test_feature_from_label_matrix(): testLabels = np.zeros((1, 4, 4)) @@ -193,3 +206,47 @@ def test_feature_contains_positions(): assert all([a == b for a, b in zip(feature4.contains_positions(positions2), [False, False, True, False, True, True])]) + + +def test_find_overlapping_cells(): + t1 = p1.get_overlapping_features(allCells) + t2 = p2.get_overlapping_features(allCells) + t3 = p3.get_overlapping_features(allCells) + t4 = p4.get_overlapping_features(allCells) + t5 = p5.get_overlapping_features(allCells) + + assert ((p1 in t1) and (p3 in t1) + and (p2 not in t1) and (p5 not in t1) and (p5 not in t1)) + assert len(t2) == 0 + assert ((p3 in t3) and (p1 in t3) and (p4 in t3) + and (p2 not in t3) and (p5 not in t3)) + assert ((p4 in t4) and (p3 in t4) and + (p1 not in t4) and (p2 not in t4) and (p5 not in t4)) + assert ((p5 in t5) and (p1 not in t5) + and (p2 not in t5) and (p3 not in t5) and (p4 not in t5)) + + +def test_remove_overlapping_cells(): + allFOVs = [0,1] + fovBoxes = [geometry.box(-1, -1, 2, 2), geometry.box(2, 2, 6, 6)] + currentFOV = 0 + cells = spatialfeature.simple_clean_cells(allCells) + + spatialIndex, _, _ = spatialfeature.construct_tree(cells) + + G = nx.Graph() + G = spatialfeature.construct_graph(G, cells, spatialIndex, + currentFOV, allFOVs, fovBoxes) + + cleanedCellsDF = 
spatialfeature.remove_overlapping_cells(G) + keptCells = cleanedCellsDF['cell_id'].values.tolist() + + assert G.nodes[p5.get_feature_id()]['originalFOV'] == 0 + assert G.nodes[p5.get_feature_id()]['assignedFOV'] == 1 + assert G.nodes[p1.get_feature_id()]['originalFOV'] == 0 + assert G.nodes[p1.get_feature_id()]['assignedFOV'] == 0 + assert p1.get_feature_id() in keptCells + assert p4.get_feature_id() in keptCells + assert p5.get_feature_id() in keptCells + assert p2.get_feature_id() not in keptCells + assert p3.get_feature_id() not in keptCells From 60f4ca9f5dd24f5761b9bc1395bd4957c4061b17 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Fri, 6 Dec 2019 23:24:42 -0500 Subject: [PATCH 04/66] Increment version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 75e1d7f7..c65fbcb8 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="merlin", - version="0.1.3", + version="0.1.4", description="MERFISH decoding software", author="George Emanuel", author_email="emanuega0@gmail.com", From 0f0dc8316d9b84071c9f776a16415471ae7b8343 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Sun, 8 Dec 2019 11:28:44 -0500 Subject: [PATCH 05/66] Update docs and requirements to better reflect the current version of merlin. (#36) --- docs/installation.rst | 11 +++++++++-- requirements.txt | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index 59d7334a..27cf3e48 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -58,13 +58,14 @@ MERlin can be installed by cloning the repository and installing with pip: Specifying paths with a .env file ================================== -A .env file is required to specify the search locations for the various input and output files. 
The following variables should be defined in a file named .env in the user home directory (~\.env on linux or C:\users\UserName\.env on Windows): +A .merlinenv file is required to specify the search locations for the various input and output files. The following variables should be defined in a file named .merlinenv in the user home directory (~\\.merlinenv on linux or C:\\users\\UserName\\.merlinenv on Windows): * DATA\_HOME - The path of the root directory to the raw data. * ANALYSIS\_HOME - The path of the root directory where analysis results should be stored. * PARAMETERS\_HOME - The path to the directory where the merfish-parameters directory resides. The PARAMETERS_HOME directory should contain the following folders: + * analysis - Contains the analysis parameters json files. * codebooks - Contains the codebook csv files. * dataorganization - Contains the data organization csv files. @@ -76,10 +77,16 @@ The PARAMETERS_HOME directory should contain the following folders: An example PARAMETERS_HOME directory with typical files can be found in the `merlin-parameters-example `_ repository. -The contents of an example .env file are below: +The contents of an example .merlinenv file are below: .. code-block:: none DATA_HOME=D:/data ANALYSIS_HOME=D:/analysis PARAMETERS_HOME=D:/merfish-parameters + +Merlin can create a .merlinenv file for you using the command: + +.. code-blocks:: none + + merlin --configure . 
diff --git a/requirements.txt b/requirements.txt index 094b9743..e84c4226 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ scipy>=1.2 matplotlib networkx rtree -shapely +shapely!=1.7a2 seaborn>=0.9.0 pyqt5 Sphinx From f08e0bea9775135a0683d08b5020404d5c5bc3d4 Mon Sep 17 00:00:00 2001 From: seichhorn Date: Tue, 10 Dec 2019 20:45:49 -0500 Subject: [PATCH 06/66] Parallel cleancell (#34) --- CHANGELOG.md | 3 + docs/tasks.rst | 7 +- merlin/analysis/globalalign.py | 17 +++ merlin/analysis/partition.py | 10 +- merlin/analysis/segment.py | 121 +++++++++++------- merlin/util/spatialfeature.py | 1 + .../test_analysis_parameters.json | 11 +- 7 files changed, 117 insertions(+), 53 deletions(-) mode change 100644 => 100755 CHANGELOG.md mode change 100644 => 100755 docs/tasks.rst diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100644 new mode 100755 index ac65e86f..431e6abf --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,3 +24,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Added option to draw field of view labels overlaid on the mosaic +## [0.1.4] - 2019-12-05 +### Changed +- Changed the clean overlapping cells to run in parallel \ No newline at end of file diff --git a/docs/tasks.rst b/docs/tasks.rst old mode 100644 new mode 100755 index ffc1c7d5..175a29b1 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -101,7 +101,12 @@ Parameters: segment.CleanCellBoundaries -------------------------------- -Description: Assigns each cell to the FOV centroid they are closest to, and eliminates overlapping cells, preferentially removing cells that overlap with the largest number of other cells until there is no more overlap in a given group of cells. 
+Description: For a FOV of interest, this task identifies all other FOVs with any overlapping regions, and constructs a graph containing cells from the FOV of interest and all cells from either that FOV or the overlapping FOVs that overlap a cell, with edges connecting overlapping cells + +segment.CombineCleanedBoundaries +-------------------------------- + +Description: Combines the cleaned cell boundaries generated for each fov, and eliminates overlapping cells, preferentially removing cells that overlap with the largest number of other cells until there is no more overlap in a given group of cells. segment.RefineCellDatabases -------------------------------- diff --git a/merlin/analysis/globalalign.py b/merlin/analysis/globalalign.py index 8e6d6134..405a7d40 100755 --- a/merlin/analysis/globalalign.py +++ b/merlin/analysis/globalalign.py @@ -2,6 +2,7 @@ import numpy as np from typing import Tuple from typing import List +from shapely import geometry from merlin.core import analysistask @@ -13,6 +14,8 @@ class GlobalAlignment(analysistask.AnalysisTask): different field of views relative to each other in order to construct a global alignment. """ + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) @abstractmethod def fov_coordinates_to_global( @@ -73,6 +76,20 @@ def get_global_extent(self) -> Tuple[float, float, float, float]: """ pass + def get_fov_boxes(self) -> List: + """ + Creates a list of shapely boxes for each fov containing the global + coordinates as the box coordinates. 
+ + Returns: + A list of shapely boxes + """ + fovs = self.dataSet.get_fovs() + boxes = [geometry.box(*self.fov_global_extent(f)) for f in fovs] + + return boxes + + class SimpleGlobalAlignment(GlobalAlignment): diff --git a/merlin/analysis/partition.py b/merlin/analysis/partition.py index cdbd1003..eddf697c 100755 --- a/merlin/analysis/partition.py +++ b/merlin/analysis/partition.py @@ -2,7 +2,7 @@ import numpy as np from merlin.core import analysistask - +from merlin.util import spatialfeature class PartitionBarcodes(analysistask.ParallelAnalysisTask): @@ -26,7 +26,7 @@ def get_estimated_time(self): def get_dependencies(self): return [self.parameters['filter_task'], self.parameters['assignment_task'], - self.parameters['cleaning_task']] + self.parameters['alignment_task']] def get_partitioned_barcodes(self, fov: int = None) -> pandas.DataFrame: """Retrieve the cell by barcode matrixes calculated from this @@ -53,10 +53,10 @@ def _run_analysis(self, fragmentIndex): self.parameters['filter_task']) assignmentTask = self.dataSet.load_analysis_task( self.parameters['assignment_task']) - cleaningTask = self.dataSet.load_analysis_task( - self.parameters['cleaning_task']) + alignTask = self.dataSet.load_analysis_task( + self.parameters['alignment_task']) - fovBoxes = cleaningTask.get_fov_boxes() + fovBoxes = alignTask.get_fov_boxes() fovIntersections = sorted([i for i, x in enumerate(fovBoxes) if fovBoxes[fragmentIndex].intersects(x)]) diff --git a/merlin/analysis/segment.py b/merlin/analysis/segment.py index 72213358..00014983 100755 --- a/merlin/analysis/segment.py +++ b/merlin/analysis/segment.py @@ -120,7 +120,7 @@ def _read_and_filter_image_stack(self, fov: int, channelIndex: int, for z in range(len(self.dataSet.get_z_positions()))]) -class CleanCellBoundaries(analysistask.AnalysisTask): +class CleanCellBoundaries(analysistask.ParallelAnalysisTask): ''' A task to construct a network graph where each cell is a node, and overlaps are represented by edges. 
This graph is then refined to assign cells to the @@ -136,6 +136,9 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.alignTask = self.dataSet.load_analysis_task( self.parameters['global_align_task']) + def fragment_count(self): + return len(self.dataSet.get_fovs()) + def get_estimated_memory(self): return 2048 @@ -146,46 +149,34 @@ def get_dependencies(self): return [self.parameters['segment_task'], self.parameters['global_align_task']] - def get_fov_boxes(self): - allFOVs = self.dataSet.get_fovs() - coords = [self.alignTask.fov_global_extent(f) for f in allFOVs] - coordsDF = pandas.DataFrame(coords, - columns=['minx', 'miny', 'maxx', 'maxy'], - index=allFOVs) - boxes = [geometry.box(x[0], x[1], x[2], x[3]) for x in - coordsDF.loc[:, ['minx', 'miny', 'maxx', 'maxy']].values] - - return boxes - - def _construct_fov_tree(self, tiledPositions: pandas.DataFrame, - fovIntersections: List): - return cKDTree(data=tiledPositions.loc[fovIntersections, - ['centerX', 'centerY']].values) - - def _intial_clean(self, currentFOV: int): - currentCells = self.segmentTask.get_feature_database()\ - .read_features(currentFOV) - return [cell for cell in currentCells - if len(cell.get_bounding_box()) == 4 and cell.get_volume() > 0] - - def _append_cells_to_spatial_tree(self, tree: rtree.index.Index, - cells: List, idToNum: Dict): - for element in cells: - tree.insert(idToNum[element.get_feature_id()], - element.get_bounding_box(), obj=element) + def _write_graph(self, analysisResult, resultName: str, + analysisName: str, resultIndex: int = None, + subdirectory: str = None) -> None: - def return_exported_data(self): - kwargs = {'index_col': 0} - return self.dataSet.load_dataframe_from_csv( - 'cleanedcells', analysisTask=self.analysisName, **kwargs) + savePath = self.dataSet._analysis_result_save_path( + resultName, analysisName, resultIndex, subdirectory, '.gpickle') + nx.readwrite.gpickle.write_gpickle(analysisResult, savePath) - def _run_analysis(self) -> None: + 
def return_exported_data(self, fragmentIndex) -> nx.Graph: + + savePath = self.dataSet._analysis_result_save_path( + 'cleaned_cells', self.analysisName, fragmentIndex, None, '.gpickle') + + loadedG = nx.readwrite.gpickle.read_gpickle(savePath) + + return loadedG + + def _run_analysis(self, fragmentIndex) -> None: + allFOVs = np.array(self.dataSet.get_fovs()) + fovBoxes = self.alignTask.get_fov_boxes() + fovIntersections = sorted([i for i, x in enumerate(fovBoxes) if + fovBoxes[fragmentIndex].intersects(x)]) + intersectingFOVs = list(allFOVs[np.array(fovIntersections)]) spatialTree = rtree.index.Index() count = 0 idToNum = dict() - allFOVs = self.dataSet.get_fovs() - for currentFOV in allFOVs: + for currentFOV in intersectingFOVs: cells = self.segmentTask.get_feature_database()\ .read_features(currentFOV) cells = spatialfeature.simple_clean_cells(cells) @@ -193,19 +184,59 @@ def _run_analysis(self) -> None: spatialTree, count, idToNum = spatialfeature.construct_tree( cells, spatialTree, count, idToNum) - fovBoxes = self.get_fov_boxes() + graph = nx.Graph() + cells = self.segmentTask.get_feature_database()\ + .read_features(fragmentIndex) + cells = spatialfeature.simple_clean_cells(cells) + graph = spatialfeature.construct_graph(graph, cells, + spatialTree, fragmentIndex, + allFOVs, fovBoxes) + + self._write_graph(graph, 'cleaned_cells', + self.analysisName, fragmentIndex) + + +class CombineCleanedBoundaries(analysistask.AnalysisTask): + """ + A task to construct a network graph where each cell is a node, and overlaps + are represented by edges. This graph is then refined to assign cells to the + fov they are closest to (in terms of centroid). This graph is then refined + to eliminate overlapping cells to leave a single cell occupying a given + position. 
+ + """ + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) + + self.cleaningTask = self.dataSet.load_analysis_task( + self.parameters['cleaning_task']) + + def get_estimated_memory(self): + # TODO - refine estimate + return 2048 + + def get_estimated_time(self): + # TODO - refine estimate + return 5 + + def get_dependencies(self): + return [self.parameters['cleaning_task']] + + def return_exported_data(self): + kwargs = {'index_col': 0} + return self.dataSet.load_dataframe_from_csv( + 'all_cleaned_cells', analysisTask=self.analysisName, **kwargs) + + def _run_analysis(self): + allFOVs = self.dataSet.get_fovs() graph = nx.Graph() for currentFOV in allFOVs: - cells = self.segmentTask.get_feature_database()\ - .read_features(currentFOV) - cells = spatialfeature.simple_clean_cells(cells) - graph = spatialfeature.construct_graph(graph, cells, - spatialTree, currentFOV, - allFOVs, fovBoxes) + subGraph = self.cleaningTask.return_exported_data(currentFOV) + graph = nx.compose(graph, subGraph) cleanedCells = spatialfeature.remove_overlapping_cells(graph) - self.dataSet.save_dataframe_to_csv(cleanedCells, 'cleanedcells', + self.dataSet.save_dataframe_to_csv(cleanedCells, 'all_cleaned_cells', analysisTask=self) @@ -216,7 +247,7 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.segmentTask = self.dataSet.load_analysis_task( self.parameters['segment_task']) self.cleaningTask = self.dataSet.load_analysis_task( - self.parameters['cleaning_task']) + self.parameters['combine_cleaning_task']) def fragment_count(self): return len(self.dataSet.get_fovs()) @@ -231,7 +262,7 @@ def get_estimated_time(self): def get_dependencies(self): return [self.parameters['segment_task'], - self.parameters['cleaning_task']] + self.parameters['combine_cleaning_task']] def _run_analysis(self, fragmentIndex): diff --git a/merlin/util/spatialfeature.py b/merlin/util/spatialfeature.py index 10de7f0a..6720f877 
100755 --- a/merlin/util/spatialfeature.py +++ b/merlin/util/spatialfeature.py @@ -16,6 +16,7 @@ from merlin.core import dataset +from merlin.core import analysistask class SpatialFeature(object): diff --git a/test/auxiliary_files/test_analysis_parameters.json b/test/auxiliary_files/test_analysis_parameters.json index 00e0baa5..4d27f2ec 100755 --- a/test/auxiliary_files/test_analysis_parameters.json +++ b/test/auxiliary_files/test_analysis_parameters.json @@ -120,12 +120,19 @@ "global_align_task": "SimpleGlobalAlignment" } }, + { + "task": "CombineCleanedBoundaries", + "module": "merlin.analysis.segment", + "parameters": { + "cleaning_task": "CleanCellBoundaries" + } + }, { "task": "RefineCellDatabases", "module": "merlin.analysis.segment", "parameters": { "segment_task": "WatershedSegment", - "cleaning_task": "CleanCellBoundaries" + "combine_cleaning_task": "CombineCleanedBoundaries" } }, { @@ -134,7 +141,7 @@ "parameters": { "filter_task": "AdaptiveFilterBarcodes", "assignment_task": "RefineCellDatabases", - "cleaning_task": "CleanCellBoundaries" + "alignment_task": "SimpleGlobalAlignment" } }, { From e3f3c959e8f5ed271fa126e82726221619bb242b Mon Sep 17 00:00:00 2001 From: seichhorn Date: Fri, 10 Jan 2020 23:17:04 -0500 Subject: [PATCH 07/66] Reduced snakemake overhead by adding analysis task to check completion of parallel analysis tasks --- CHANGELOG.md | 5 +- docs/tasks.rst | 9 +++ merlin/analysis/paralleltaskcomplete.py | 26 ++++++ merlin/core/analysistask.py | 28 ++++--- merlin/core/dataset.py | 4 +- merlin/util/snakewriter.py | 100 ++++++++++++++++-------- test/conftest.py | 0 test/test_snakemake.py | 0 8 files changed, 127 insertions(+), 45 deletions(-) create mode 100755 merlin/analysis/paralleltaskcomplete.py mode change 100644 => 100755 merlin/util/snakewriter.py mode change 100644 => 100755 test/conftest.py mode change 100644 => 100755 test/test_snakemake.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 431e6abf..da21e966 100755 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -25,5 +25,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added option to draw field of view labels overlaid on the mosaic ## [0.1.4] - 2019-12-05 +### Added +- Added task to evaluate whether a parallel analysis task has completed ### Changed -- Changed the clean overlapping cells to run in parallel \ No newline at end of file +- Changed the clean overlapping cells to run in parallel +- Snakemake job inputs were simplified using the ParallelCompleteTask to improve DAG construction speed and overall snakemake runtime performance \ No newline at end of file diff --git a/docs/tasks.rst b/docs/tasks.rst index 175a29b1..44789416 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -179,3 +179,12 @@ Parameters: * sum\_task * partition\_task * global\_align\_task + +paralleltaskcomplete.ParallelTaskComplete +_________________________________________ + +Description: Check whether a parallel analysis task has completed all jobs and create a done fine for that task if so. This task does not need to be invoked by the user, it is used by the snakewriter. + +Parameters: + +* dependent\_task -- the parallel analysis task to check to see if it has completed diff --git a/merlin/analysis/paralleltaskcomplete.py b/merlin/analysis/paralleltaskcomplete.py new file mode 100755 index 00000000..f8f43835 --- /dev/null +++ b/merlin/analysis/paralleltaskcomplete.py @@ -0,0 +1,26 @@ +from merlin.core import analysistask + + +class ParallelTaskComplete(analysistask.AnalysisTask): + """ + A task to simplify snakemake construction, there is no need to explicitly + invoke this task in analysis files, it is inferred by the snakewriter. + If running outside snakemake this class is not required. 
+ """ + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) + + def get_estimated_memory(self): + return 100 + + def get_dependencies(self): + # The dependency is inferred by the snakewriter + return [self.parameters['dependent_task']] + + def get_estimated_time(self): + return 1 + + def _run_analysis(self): + dependentTask = self.dataSet.load_analysis_task( + self.parameters['dependent_task']) + dependentTask.is_complete() diff --git a/merlin/core/analysistask.py b/merlin/core/analysistask.py index 08cf648e..571e2991 100755 --- a/merlin/core/analysistask.py +++ b/merlin/core/analysistask.py @@ -250,7 +250,7 @@ class InternallyParallelAnalysisTask(AnalysisTask): """ An abstract class for analysis that can only be run in one part, but can internally be sped up using multiple processes. Subclasses - should implement the analysis to perform in te run_analysis() function. + should implement the analysis to perform in the run_analysis() function. """ def __init__(self, dataSet, parameters=None, analysisName=None): @@ -282,10 +282,6 @@ class ParallelAnalysisTask(AnalysisTask): def __init__(self, dataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) - @abstractmethod - def fragment_count(self): - pass - def run(self, fragmentIndex: int=None, overwrite=True) -> None: """Run the specified index of this analysis task. 
@@ -322,7 +318,7 @@ def run(self, fragmentIndex: int=None, overwrite=True) -> None: self.dataSet.record_analysis_started(self, fragmentIndex) self._indicate_running(fragmentIndex) self._run_analysis(fragmentIndex) - self.dataSet.record_analysis_complete(self, fragmentIndex) + self.dataSet.record_analysis_complete(self, fragmentIndex) logger.info('Completed %s %i' % (self.get_analysis_name(), fragmentIndex)) self.dataSet.close_logger(self, fragmentIndex) @@ -332,6 +328,10 @@ def run(self, fragmentIndex: int=None, overwrite=True) -> None: self.dataSet.close_logger(self, fragmentIndex) raise e + @abstractmethod + def fragment_count(self): + pass + def _reset_analysis(self, fragmentIndex: int=None) -> None: """Remove files created by this analysis task and remove markers indicating that this analysis has been started, or has completed. @@ -377,12 +377,18 @@ def is_error(self, fragmentIndex=None): def is_complete(self, fragmentIndex=None): if fragmentIndex is None: - for i in range(self.fragment_count()): - if not self.is_complete(i): + missingCount = [] + if self.dataSet.check_analysis_done(self): + return True + else: + for i in range(self.fragment_count()): + if not self.is_complete(i): + missingCount.append(i) + if len(missingCount) > 0: return False - - return True - + else: + self.dataSet.record_analysis_complete(self) + return True else: return self.dataSet.check_analysis_done(self, fragmentIndex) diff --git a/merlin/core/dataset.py b/merlin/core/dataset.py index 8d8e3b0b..24b1e9e0 100755 --- a/merlin/core/dataset.py +++ b/merlin/core/dataset.py @@ -718,6 +718,7 @@ def record_analysis_error(self, analysisTask: analysistask.AnalysisTask, fragmentIndex: int = None) -> None: self._record_analysis_event(analysisTask, 'error', fragmentIndex) + def get_analysis_start_time(self, analysisTask: analysistask.AnalysisTask, fragmentIndex: int = None) -> float: """Get the time that this analysis task started @@ -805,6 +806,7 @@ def check_analysis_error(self, analysisTask: 
analysistask.AnalysisTask, fragmentIndex: int = None) -> bool: return self._check_analysis_event(analysisTask, 'error', fragmentIndex) + def reset_analysis_status(self, analysisTask: analysistask.AnalysisTask, fragmentIndex: int = None): if analysisTask.is_running(): @@ -814,7 +816,7 @@ def reset_analysis_status(self, analysisTask: analysistask.AnalysisTask, self._reset_analysis_event(analysisTask, 'run', fragmentIndex) self._reset_analysis_event(analysisTask, 'done', fragmentIndex) self._reset_analysis_event(analysisTask, 'error', fragmentIndex) - + self._reset_analysis_event(analysisTask, 'done') class ImageDataSet(DataSet): diff --git a/merlin/util/snakewriter.py b/merlin/util/snakewriter.py old mode 100644 new mode 100755 index acad3704..13f5efb4 --- a/merlin/util/snakewriter.py +++ b/merlin/util/snakewriter.py @@ -1,6 +1,6 @@ import importlib import networkx - +from merlin.analysis.paralleltaskcomplete import ParallelTaskComplete from merlin.core import analysistask from merlin.core import dataset @@ -25,21 +25,6 @@ def _expand_as_string(self, taskName, indexCount) -> str: self._analysisTask.dataSet.analysis_done_filename(taskName, '{g}')), indexCount) - def _generate_input_names(self, task): - if isinstance(task, analysistask.ParallelAnalysisTask): - return self._clean_string(self._expand_as_string( - task.get_analysis_name(), task.fragment_count())) - else: - return self._clean_string(self._add_quotes( - self._analysisTask.dataSet.analysis_done_filename(task))) - - def _generate_input(self) -> str: - inputTasks = [self._analysisTask.dataSet.load_analysis_task(x) - for x in self._analysisTask.get_dependencies()] - inputString = ','.join([self._generate_input_names(x) - for x in inputTasks]) - return self._clean_string(inputString) - def _generate_output(self) -> str: if isinstance(self._analysisTask, analysistask.ParallelAnalysisTask): return self._clean_string( @@ -47,10 +32,38 @@ def _generate_output(self) -> str: 
self._analysisTask.dataSet.analysis_done_filename( self._analysisTask, '{i}'))) else: - return self._clean_string( - self._add_quotes( - self._analysisTask.dataSet.analysis_done_filename( - self._analysisTask))) + if self._analysisTask.__class__ == ParallelTaskComplete: + dependentTask = self._analysisTask.parameters['dependent_task'] + taskToUse = self._analysisTask.dataSet.load_analysis_task( + dependentTask) + out = taskToUse.dataSet.analysis_done_filename(taskToUse) + return self._clean_string(self._add_quotes(out)) + else: + return self._clean_string( + self._add_quotes( + self._analysisTask.dataSet.analysis_done_filename( + self._analysisTask))) + + def _generate_current_task_inputs(self): + inputTasks = [self._analysisTask.dataSet.load_analysis_task(x) + for x in self._analysisTask.get_dependencies()] + if self._analysisTask.__class__ == ParallelTaskComplete: + inputString = [] + for t in inputTasks: + if isinstance(t, analysistask.ParallelAnalysisTask): + inputString.append(self._expand_as_string( + t, t.fragment_count())) + else: + inputString.append(t.dataSet.analysis_done_filename(t)) + inputString = ','.join(inputString) + else: + if len(inputTasks) > 0: + inputString = ','.join([self._add_quotes( + x.dataSet.analysis_done_filename(x)) for x in inputTasks]) + else: + inputString = '' + + return self._clean_string(inputString) def _generate_message(self) -> str: messageString = \ @@ -83,15 +96,23 @@ def _generate_shell(self) -> str: def as_string(self) -> str: fullString = ('rule %s:\n\tinput: %s\n\toutput: %s\n\tmessage: %s\n\t' - + 'shell: %s\n') \ - % (self._analysisTask.get_analysis_name(), - self._generate_input(), self._generate_output(), - self._generate_message(), self._generate_shell()) - + + 'shell: %s\n\n') \ + % (self._analysisTask.get_analysis_name(), + self._generate_current_task_inputs(), + self._generate_output(), + self._generate_message(), self._generate_shell()) return fullString def full_output(self) -> str: - return 
self._generate_input_names(self._analysisTask) + if isinstance(self._analysisTask, analysistask.ParallelAnalysisTask): + return self._clean_string(self._expand_as_string( + self._analysisTask.get_analysis_name(), + self._analysisTask.fragment_count())) + else: + return self._clean_string( + self._add_quotes( + self._analysisTask.dataSet.analysis_done_filename( + self._analysisTask))) class SnakefileGenerator(object): @@ -120,6 +141,19 @@ def _parse_parameters(self): analysisTasks[newTask.get_analysis_name()] = newTask return analysisTasks + def _add_parallel_completion_tasks(self, analysisTasks): + updatedTasks = {} + for k, v in analysisTasks.items(): + updatedTasks[k] = v + if isinstance(v, analysistask.ParallelAnalysisTask): + parameters = {'dependent_task': k} + newTask = ParallelTaskComplete(self._dataSet, parameters, + '{}Done'.format(k)) + if newTask.get_analysis_name() not in analysisTasks: + newTask.save() + updatedTasks[newTask.get_analysis_name()] = newTask + return updatedTasks + def _identify_terminal_tasks(self, analysisTasks): taskGraph = networkx.DiGraph() for x in analysisTasks.keys(): @@ -129,8 +163,7 @@ def _identify_terminal_tasks(self, analysisTasks): for d in a.get_dependencies(): taskGraph.add_edge(d, x) - return [k for k, v in taskGraph.out_degree if v == 0 - and not analysisTasks[k].is_complete()] + return [k for k, v in taskGraph.out_degree if v == 0] def generate_workflow(self) -> str: """Generate a snakemake workflow for the analysis parameters @@ -140,13 +173,16 @@ def generate_workflow(self) -> str: the path to the generated snakemake workflow """ analysisTasks = self._parse_parameters() + terminalTasks = self._identify_terminal_tasks(analysisTasks) + + analysisTasks = self._add_parallel_completion_tasks(analysisTasks) + ruleList = {k: SnakemakeRule(v, self._pythonPath) - for k, v in analysisTasks.items() if not v.is_complete()} + for k, v in analysisTasks.items()} - terminalTasks = self._identify_terminal_tasks(analysisTasks) 
workflowString = 'rule all: \n\tinput: ' + \ - ','.join([ruleList[x].full_output() for x in terminalTasks]) + \ - '\n\n' + ','.join([ruleList[x].full_output() + for x in terminalTasks]) + '\n\n' workflowString += '\n'.join([x.as_string() for x in ruleList.values()]) return self._dataSet.save_workflow(workflowString) diff --git a/test/conftest.py b/test/conftest.py old mode 100644 new mode 100755 diff --git a/test/test_snakemake.py b/test/test_snakemake.py old mode 100644 new mode 100755 From 63aebc5b5e557a3a3ff3e77153f306dc6e1a8ed4 Mon Sep 17 00:00:00 2001 From: leonardosepulveda Date: Wed, 22 Jan 2020 16:31:51 -0500 Subject: [PATCH 08/66] Fix bug in GenerateAdaptiveThreshold, avoids crash when resubmit (#40) * updated _run_analysis so GenerateAdaptiveThreshold (GAT) does not crash if the task is reran after a crash. The problem occurs only if the GAT folder is not deleted prior to a rerun. In that case, merlin would see that the .npy files exist so blankCounts would not be initialized. Merlin would crash when blankCounts += is called, because is trying to sum None and a matrix --- merlin/analysis/filterbarcodes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index ccb01ba3..c0efcbc2 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -260,7 +260,8 @@ def _run_analysis(self): updated = False while not all(completeFragments): - if intensityBins is None: + if (intensityBins is None or + blankCounts is None or codingCounts is None): for i in range(self.fragment_count()): if not pendingFragments[i] and decodeTask.is_complete(i): pendingFragments[i] = decodeTask.is_complete(i) From fe5c4de5c27c445c711535e727f2c1ffa6e89e1e Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Wed, 22 Jan 2020 18:13:51 -0500 Subject: [PATCH 09/66] Increment version number. 
--- CHANGELOG.md | 4 +++- setup.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da21e966..9c45c770 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,4 +29,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added task to evaluate whether a parallel analysis task has completed ### Changed - Changed the clean overlapping cells to run in parallel -- Snakemake job inputs were simplified using the ParallelCompleteTask to improve DAG construction speed and overall snakemake runtime performance \ No newline at end of file +- Snakemake job inputs were simplified using the ParallelCompleteTask to improve DAG construction speed and overall snakemake runtime performance + +## [0.1.5] - 2020-01-22 diff --git a/setup.py b/setup.py index c65fbcb8..fed80ba9 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="merlin", - version="0.1.4", + version="0.1.5", description="MERFISH decoding software", author="George Emanuel", author_email="emanuega0@gmail.com", From 3e82b5297ebb7f581ac93052549094ea9a68f75b Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Thu, 30 Jan 2020 11:19:33 -0500 Subject: [PATCH 10/66] Updated aws bucket name. --- test/test_dataportal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_dataportal.py b/test/test_dataportal.py index e6a235ff..d5c1449f 100644 --- a/test/test_dataportal.py +++ b/test/test_dataportal.py @@ -23,7 +23,7 @@ def local_data_portal(): def s3_data_portal(): - yield dataportal.S3DataPortal('s3://merlin-test-bucket/test-files', + yield dataportal.S3DataPortal('s3://merlin-test-bucket-vg/test-files', region_name='us-east-2', config=Config(signature_version=UNSIGNED)) From ec5eb876f14b4773de26b0decae650985c98d27a Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Thu, 30 Jan 2020 11:35:26 -0500 Subject: [PATCH 11/66] Updated shapely requirements. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e84c4226..9b6a77a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ scipy>=1.2 matplotlib networkx rtree -shapely!=1.7a2 +shapely<1.7a2 seaborn>=0.9.0 pyqt5 Sphinx From b688a08a2d9ce515bd3d9d2615cd0dab093b75a1 Mon Sep 17 00:00:00 2001 From: leonardosepulveda Date: Sat, 15 Feb 2020 14:02:22 -0500 Subject: [PATCH 12/66] Fix docutil version requirements. --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9b6a77a0..c65e9715 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,3 +28,4 @@ tables boto3 xmltodict google-cloud-storage +docutils<0.16,>=0.10 \ No newline at end of file From f221a3d58872ef1b0259bea87c52180f90c8266d Mon Sep 17 00:00:00 2001 From: timblosser Date: Thu, 27 Feb 2020 11:12:26 -0500 Subject: [PATCH 13/66] Updated filemap to only include filename so that the path can be easily changed --- CHANGELOG.md | 2 ++ merlin/data/dataorganization.py | 11 ++++++++++- merlin/util/dataportal.py | 35 ++++++++++++++++++++++++++++++--- requirements.txt | 2 +- test/test_dataportal.py | 2 +- 5 files changed, 46 insertions(+), 6 deletions(-) mode change 100644 => 100755 merlin/util/dataportal.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c45c770..3d6d90c3 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,3 +32,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Snakemake job inputs were simplified using the ParallelCompleteTask to improve DAG construction speed and overall snakemake runtime performance ## [0.1.5] - 2020-01-22 +### Changed +- Updated the filemap to only store the file name so that it can easily be pointed to new data home directories. This change maintains backward compatibility. 
diff --git a/merlin/data/dataorganization.py b/merlin/data/dataorganization.py index 28bab4f1..858841da 100755 --- a/merlin/data/dataorganization.py +++ b/merlin/data/dataorganization.py @@ -253,8 +253,13 @@ def _get_image_path( selection = self.fileMap[(self.fileMap['imageType'] == imageType) & (self.fileMap['fov'] == fov) & (self.fileMap['imagingRound'] == imagingRound)] + filemapPath = selection['imagePath'].values[0] + return os.path.join(self._dataSet.dataHome, self._dataSet.dataSetName, + filemapPath) - return selection['imagePath'].values[0] + def _truncate_file_path(self, path) -> None: + head, tail = os.path.split(path) + return tail def _map_image_files(self) -> None: # TODO: This doesn't map the fiducial image types and currently assumes @@ -263,6 +268,8 @@ def _map_image_files(self) -> None: try: self.fileMap = self._dataSet.load_dataframe_from_csv('filemap') + self.fileMap['imagePath'] = self.fileMap['imagePath'].apply( + self._truncate_file_path) except FileNotFoundError: uniqueEntries = self.data.drop_duplicates( @@ -302,6 +309,8 @@ def _map_image_files(self) -> None: self.fileMap = pandas.DataFrame(fileData) self.fileMap[['imagingRound', 'fov']] = \ self.fileMap[['imagingRound', 'fov']].astype(int) + self.fileMap['imagePath'] = self.fileMap['imagePath'].apply( + self._truncate_file_path) self._validate_file_map() diff --git a/merlin/util/dataportal.py b/merlin/util/dataportal.py old mode 100644 new mode 100755 index e3d0f6e8..3c8d2bb6 --- a/merlin/util/dataportal.py +++ b/merlin/util/dataportal.py @@ -2,9 +2,11 @@ import boto3 import botocore from google.cloud import storage +from google.cloud import exceptions from urllib import parse from abc import abstractmethod, ABC from typing import List +from time import sleep class DataPortal(ABC): @@ -340,12 +342,39 @@ def get_sibling_with_extension(self, newExtension: str): return GCloudFilePortal( self._exchange_extension(newExtension), self._client) + def _error_tolerant_reading(self, method, 
startByte=None, + endByte=None): + backoffSeries = [1, 2, 4, 8, 16, 32, 64, 128, 256] + for sleepDuration in backoffSeries: + try: + file = method(start=startByte, end=endByte) + return file + except (exceptions.GatewayTimeout, exceptions.ServiceUnavailable): + if sleepDuration == backoffSeries[-1]: + raise + else: + sleep(sleepDuration) + def read_as_text(self): - return self._fileHandle.download_as_string().decode('utf-8') + """ + Attempts to read a file from bucket as text, it if encounters a timeout + exception it reattempts after sleeping for exponentially increasing + delays, up to a delay of about 4 minutes + """ + file = self._error_tolerant_reading(self._fileHandle.download_as_string) + return file.decode('utf-8') def read_file_bytes(self, startByte, endByte): - return self._fileHandle.download_as_string( - start=startByte, end=endByte-1) + """ + Attempts to read a file from bucket as bytes, it if encounters a timeout + exception it reattempts after sleeping for exponentially increasing + delays, up to a delay of about 4 minutes + """ + file = self._error_tolerant_reading(self._fileHandle.download_as_string, + startByte=startByte, + endByte=endByte-1) + return file + def close(self) -> None: pass diff --git a/requirements.txt b/requirements.txt index e84c4226..9b6a77a0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ scipy>=1.2 matplotlib networkx rtree -shapely!=1.7a2 +shapely<1.7a2 seaborn>=0.9.0 pyqt5 Sphinx diff --git a/test/test_dataportal.py b/test/test_dataportal.py index e6a235ff..d5c1449f 100644 --- a/test/test_dataportal.py +++ b/test/test_dataportal.py @@ -23,7 +23,7 @@ def local_data_portal(): def s3_data_portal(): - yield dataportal.S3DataPortal('s3://merlin-test-bucket/test-files', + yield dataportal.S3DataPortal('s3://merlin-test-bucket-vg/test-files', region_name='us-east-2', config=Config(signature_version=UNSIGNED)) From ab9a4eea62ecbf736850073cc9b7bcdce9bbca57 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Sun, 
9 Feb 2020 16:52:44 -0500 Subject: [PATCH 14/66] Moved graph read and write to within dataset. --- merlin/analysis/segment.py | 21 ++++-------------- merlin/core/dataset.py | 44 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/merlin/analysis/segment.py b/merlin/analysis/segment.py index 00014983..786cb0d9 100755 --- a/merlin/analysis/segment.py +++ b/merlin/analysis/segment.py @@ -149,22 +149,9 @@ def get_dependencies(self): return [self.parameters['segment_task'], self.parameters['global_align_task']] - def _write_graph(self, analysisResult, resultName: str, - analysisName: str, resultIndex: int = None, - subdirectory: str = None) -> None: - - savePath = self.dataSet._analysis_result_save_path( - resultName, analysisName, resultIndex, subdirectory, '.gpickle') - nx.readwrite.gpickle.write_gpickle(analysisResult, savePath) - def return_exported_data(self, fragmentIndex) -> nx.Graph: - - savePath = self.dataSet._analysis_result_save_path( - 'cleaned_cells', self.analysisName, fragmentIndex, None, '.gpickle') - - loadedG = nx.readwrite.gpickle.read_gpickle(savePath) - - return loadedG + return self.dataSet.load_graph_from_gpickle( + 'cleaned_cells', self, fragmentIndex) def _run_analysis(self, fragmentIndex) -> None: allFOVs = np.array(self.dataSet.get_fovs()) @@ -192,8 +179,8 @@ def _run_analysis(self, fragmentIndex) -> None: spatialTree, fragmentIndex, allFOVs, fovBoxes) - self._write_graph(graph, 'cleaned_cells', - self.analysisName, fragmentIndex) + self.dataSet.save_graph_as_gpickle( + graph, 'cleaned_cells', self, fragmentIndex) class CombineCleanedBoundaries(analysistask.AnalysisTask): diff --git a/merlin/core/dataset.py b/merlin/core/dataset.py index 24b1e9e0..d53db90d 100755 --- a/merlin/core/dataset.py +++ b/merlin/core/dataset.py @@ -9,6 +9,7 @@ import logging import pickle import datetime +import networkx as nx from matplotlib import pyplot as plt from typing import List from typing import Tuple @@ 
-266,6 +267,49 @@ def list_analysis_files(self, analysisTask: TaskOrName = None, fileList = [os.path.join(basePath, x) for x in fileList] return fileList + def save_graph_as_gpickle( + self, graph: nx.Graph, resultName: str, + analysisTask: TaskOrName = None, resultIndex: int = None, + subdirectory: str = None): + """ Save a networkx graph as a gpickle into the analysis results + + Args: + graph: the networkx graph to save + resultName: the base name of the output file + analysisTask: the analysis task that the graph should be + saved under. If None, the graph is saved to the + data set root. + resultIndex: index of the graph to save or None if no index + should be specified + subdirectory: subdirectory of the analysis task that the graph + should be saved to or None if the graph should be + saved to the root directory for the analysis task. + """ + savePath = self._analysis_result_save_path( + resultName, analysisTask, resultIndex, subdirectory, '.gpickle') + nx.readwrite.gpickle.write_gpickle(graph, savePath) + + def load_graph_from_gpickle( + self, resultName: str, analysisTask: TaskOrName = None, + resultIndex: int = None, subdirectory: str = None): + """ Load a networkx graph from a gpickle objective saved in the analysis + results. + + Args: + resultName: the base name of the output file + analysisTask: the analysis task that the graph should be + saved under. If None, the graph is saved to the + data set root. + resultIndex: index of the graph to save or None if no index + should be specified + subdirectory: subdirectory of the analysis task that the graph + should be saved to or None if the graph should be + saved to the root directory for the analysis task. 
+ """ + savePath = self._analysis_result_save_path( + resultName, analysisTask, resultIndex, subdirectory, '.gpickle') + return nx.readwrite.gpickle.read_gpickle(savePath) + def save_dataframe_to_csv( self, dataframe: pandas.DataFrame, resultName: str, analysisTask: TaskOrName = None, resultIndex: int = None, From 9b8a43a84aed075fda2e489e94dcaa62c565bbb6 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Sun, 9 Feb 2020 20:40:41 -0500 Subject: [PATCH 15/66] Added ancient tag for snakemake for all inputs. --- merlin/util/snakewriter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/merlin/util/snakewriter.py b/merlin/util/snakewriter.py index 13f5efb4..9916018b 100755 --- a/merlin/util/snakewriter.py +++ b/merlin/util/snakewriter.py @@ -58,8 +58,9 @@ def _generate_current_task_inputs(self): inputString = ','.join(inputString) else: if len(inputTasks) > 0: - inputString = ','.join([self._add_quotes( - x.dataSet.analysis_done_filename(x)) for x in inputTasks]) + inputString = ','.join(['ancient(' + self._add_quotes( + x.dataSet.analysis_done_filename(x)) + ')' + for x in inputTasks]) else: inputString = '' From 7640da52001fe1742efa0a7ef6eab2a3fe79160c Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Mon, 9 Mar 2020 21:31:44 -0400 Subject: [PATCH 16/66] Updated how tracking of completion of parallel analysis tasks is done. 
--- merlin/analysis/paralleltaskcomplete.py | 26 -------- merlin/merlin.py | 18 +++++- merlin/util/snakewriter.py | 86 ++++++++++++------------- 3 files changed, 55 insertions(+), 75 deletions(-) delete mode 100755 merlin/analysis/paralleltaskcomplete.py diff --git a/merlin/analysis/paralleltaskcomplete.py b/merlin/analysis/paralleltaskcomplete.py deleted file mode 100755 index f8f43835..00000000 --- a/merlin/analysis/paralleltaskcomplete.py +++ /dev/null @@ -1,26 +0,0 @@ -from merlin.core import analysistask - - -class ParallelTaskComplete(analysistask.AnalysisTask): - """ - A task to simplify snakemake construction, there is no need to explicitly - invoke this task in analysis files, it is inferred by the snakewriter. - If running outside snakemake this class is not required. - """ - def __init__(self, dataSet, parameters=None, analysisName=None): - super().__init__(dataSet, parameters, analysisName) - - def get_estimated_memory(self): - return 100 - - def get_dependencies(self): - # The dependency is inferred by the snakewriter - return [self.parameters['dependent_task']] - - def get_estimated_time(self): - return 1 - - def _run_analysis(self): - dependentTask = self.dataSet.load_analysis_task( - self.parameters['dependent_task']) - dependentTask.is_complete() diff --git a/merlin/merlin.py b/merlin/merlin.py index 144360fb..c892baa3 100755 --- a/merlin/merlin.py +++ b/merlin/merlin.py @@ -40,6 +40,9 @@ def build_parser(): help='name of the position file to use') parser.add_argument('-n', '--core-count', type=int, help='number of cores to use for the analysis') + parser.add_argument('--check-done', action='store_true', + help='flag to only check if the analysis task is ' + + 'done') parser.add_argument( '-t', '--analysis-task', help='the name of the analysis task to execute. 
If no ' @@ -121,9 +124,18 @@ def merlin(): if not args.generate_only: if args.analysis_task: - print('Running %s' % args.analysis_task) - e.run(dataSet.load_analysis_task(args.analysis_task), - index=args.fragment_index) + task = dataSet.load_analysis_task(args.analysis_task) + if args.check_done: + # checking completion creates the .done file for parallel tasks + # where completion has not yet been checked + if task.is_complete(): + print('Task %s is complete' % args.analysis_task) + else: + print('Task %s is not complete' % args.analysis_task) + + else: + print('Running %s' % args.analysis_task) + e.run(task, index=args.fragment_index) elif snakefilePath: snakemakeParameters = {} if args.snakemake_parameters: diff --git a/merlin/util/snakewriter.py b/merlin/util/snakewriter.py index 9916018b..319f4886 100755 --- a/merlin/util/snakewriter.py +++ b/merlin/util/snakewriter.py @@ -1,6 +1,5 @@ import importlib import networkx -from merlin.analysis.paralleltaskcomplete import ParallelTaskComplete from merlin.core import analysistask from merlin.core import dataset @@ -32,50 +31,31 @@ def _generate_output(self) -> str: self._analysisTask.dataSet.analysis_done_filename( self._analysisTask, '{i}'))) else: - if self._analysisTask.__class__ == ParallelTaskComplete: - dependentTask = self._analysisTask.parameters['dependent_task'] - taskToUse = self._analysisTask.dataSet.load_analysis_task( - dependentTask) - out = taskToUse.dataSet.analysis_done_filename(taskToUse) - return self._clean_string(self._add_quotes(out)) - else: - return self._clean_string( - self._add_quotes( - self._analysisTask.dataSet.analysis_done_filename( - self._analysisTask))) + return self._clean_string( + self._add_quotes( + self._analysisTask.dataSet.analysis_done_filename( + self._analysisTask))) def _generate_current_task_inputs(self): inputTasks = [self._analysisTask.dataSet.load_analysis_task(x) for x in self._analysisTask.get_dependencies()] - if self._analysisTask.__class__ == 
ParallelTaskComplete: - inputString = [] - for t in inputTasks: - if isinstance(t, analysistask.ParallelAnalysisTask): - inputString.append(self._expand_as_string( - t, t.fragment_count())) - else: - inputString.append(t.dataSet.analysis_done_filename(t)) - inputString = ','.join(inputString) + if len(inputTasks) > 0: + inputString = ','.join(['ancient(' + self._add_quotes( + x.dataSet.analysis_done_filename(x)) + ')' + for x in inputTasks]) else: - if len(inputTasks) > 0: - inputString = ','.join(['ancient(' + self._add_quotes( - x.dataSet.analysis_done_filename(x)) + ')' - for x in inputTasks]) - else: - inputString = '' + inputString = '' return self._clean_string(inputString) def _generate_message(self) -> str: messageString = \ ''.join(['Running ', self._analysisTask.get_analysis_name()]) - if isinstance(self._analysisTask, analysistask.ParallelAnalysisTask): messageString += ' {wildcards.i}' - return self._add_quotes(messageString) - def _generate_shell(self) -> str: + def _base_shell_command(self) -> str: if self._pythonPath is None: shellString = 'python ' else: @@ -88,11 +68,23 @@ def _generate_shell(self) -> str: ' -s \"', self._clean_string(self._analysisTask.dataSet.analysisHome), '\"']) + return shellString + + def _generate_shell(self) -> str: + shellString = self._base_shell_command() if isinstance(self._analysisTask, analysistask.ParallelAnalysisTask): shellString += ' -i {wildcards.i}' shellString += ' ' + self._clean_string( self._analysisTask.dataSet.dataSetName) + return self._add_quotes(shellString) + def _generate_done_shell(self) -> str: + """ Check done shell command for parallel analysis tasks + """ + shellString = self._base_shell_command() + shellString += ' --check-done' + shellString += ' ' + self._clean_string( + self._analysisTask.dataSet.dataSetName) return self._add_quotes(shellString) def as_string(self) -> str: @@ -102,6 +94,23 @@ def as_string(self) -> str: self._generate_current_task_inputs(), self._generate_output(), 
self._generate_message(), self._generate_shell()) + # for parallel tasks, add a second snakemake task to reduce the time + # it takes to generate DAGs + if isinstance(self._analysisTask, analysistask.ParallelAnalysisTask): + fullString += \ + ('rule %s:\n\tinput: %s\n\toutput: %s\n\tmessage: %s\n\t' + + 'shell: %s\n\n')\ + % (self._analysisTask.get_analysis_name() + 'Done', + self._clean_string(self._expand_as_string( + self._analysisTask, + self._analysisTask.fragment_count())), + self._add_quotes(self._clean_string( + self._analysisTask.dataSet.analysis_done_filename( + self._analysisTask))), + self._add_quotes( + 'Checking %s done' % self._analysisTask.analysisName), + self._generate_done_shell()) + print(fullString) return fullString def full_output(self) -> str: @@ -119,7 +128,7 @@ def full_output(self) -> str: class SnakefileGenerator(object): def __init__(self, analysisParameters, dataSet: dataset.DataSet, - pythonPath: str=None): + pythonPath: str = None): self._analysisParameters = analysisParameters self._dataSet = dataSet self._pythonPath = pythonPath @@ -142,19 +151,6 @@ def _parse_parameters(self): analysisTasks[newTask.get_analysis_name()] = newTask return analysisTasks - def _add_parallel_completion_tasks(self, analysisTasks): - updatedTasks = {} - for k, v in analysisTasks.items(): - updatedTasks[k] = v - if isinstance(v, analysistask.ParallelAnalysisTask): - parameters = {'dependent_task': k} - newTask = ParallelTaskComplete(self._dataSet, parameters, - '{}Done'.format(k)) - if newTask.get_analysis_name() not in analysisTasks: - newTask.save() - updatedTasks[newTask.get_analysis_name()] = newTask - return updatedTasks - def _identify_terminal_tasks(self, analysisTasks): taskGraph = networkx.DiGraph() for x in analysisTasks.keys(): @@ -176,8 +172,6 @@ def generate_workflow(self) -> str: analysisTasks = self._parse_parameters() terminalTasks = self._identify_terminal_tasks(analysisTasks) - analysisTasks = 
self._add_parallel_completion_tasks(analysisTasks) - ruleList = {k: SnakemakeRule(v, self._pythonPath) for k, v in analysisTasks.items()} From 96af66eeff484ea56f66dfb24e28037424c62eb4 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Mon, 9 Mar 2020 21:47:08 -0400 Subject: [PATCH 17/66] Removed print statement. --- merlin/util/snakewriter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/merlin/util/snakewriter.py b/merlin/util/snakewriter.py index 319f4886..5310c048 100755 --- a/merlin/util/snakewriter.py +++ b/merlin/util/snakewriter.py @@ -110,7 +110,6 @@ def as_string(self) -> str: self._add_quotes( 'Checking %s done' % self._analysisTask.analysisName), self._generate_done_shell()) - print(fullString) return fullString def full_output(self) -> str: From 9aef30561b3eeae33e00ce6cac45621282330f10 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Tue, 10 Mar 2020 08:03:19 -0400 Subject: [PATCH 18/66] Reduced time to check if analysis task is idle. Now a analysis task can be restarted more quickly. 
--- merlin/core/dataset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/merlin/core/dataset.py b/merlin/core/dataset.py index d53db90d..bc120af6 100755 --- a/merlin/core/dataset.py +++ b/merlin/core/dataset.py @@ -201,7 +201,7 @@ def get_analysis_image( analysisTask, imageBaseName, imageIndex)) indexInFile = sliceIndex*imagesPerSlice + frameIndex return imageFile.asarray(key=int(indexInFile)) - + def writer_for_analysis_images( self, analysisTask: TaskOrName, imageBaseName: str, imageIndex: int = None, imagej: bool = True) -> tifffile.TiffWriter: @@ -830,7 +830,7 @@ def is_analysis_idle(self, analysisTask: analysistask.AnalysisTask, fileName = self._analysis_status_file( analysisTask, 'run', fragmentIndex) try: - return time.time() - os.path.getmtime(fileName) > 120 + return time.time() - os.path.getmtime(fileName) > 1 except FileNotFoundError: return True @@ -850,7 +850,6 @@ def check_analysis_error(self, analysisTask: analysistask.AnalysisTask, fragmentIndex: int = None) -> bool: return self._check_analysis_event(analysisTask, 'error', fragmentIndex) - def reset_analysis_status(self, analysisTask: analysistask.AnalysisTask, fragmentIndex: int = None): if analysisTask.is_running(): From 502ba495da583d215dc551c2105e137fa8f4182b Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 25 Mar 2020 11:42:25 -0400 Subject: [PATCH 19/66] adding functionality to remove zplane duplicates --- merlin/analysis/filterbarcodes.py | 36 +++++++++++++++++++-------- merlin/util/duplicatebcremoval.py | 41 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 10 deletions(-) create mode 100755 merlin/util/duplicatebcremoval.py diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index c0efcbc2..dcb62303 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -5,7 +5,7 @@ from merlin.core import analysistask from merlin.data.codebook import Codebook from merlin.analysis import 
decode - +from merlin.util import duplicatebcremoval class FilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): @@ -23,6 +23,13 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['intensity_threshold'] = 200 if 'distance_threshold' not in self.parameters: self.parameters['distance_threshold'] = 1e6 + if 'remove_zplane_duplicates' not in self.parameters: + self.parameters['remove_zplane_duplicates'] = False + if self.parameters['remove_zplane_duplicates']: + if 'z_planes_above_below' not in self.parameters: + self.parameters['z_planes_above_below'] = 1 + if 'max_centroid_separation' not in self.parameters: + self.parameters['max_centroid_separation'] = np.sqrt(2) def fragment_count(self): return len(self.dataSet.get_fovs()) @@ -41,6 +48,15 @@ def get_codebook(self): self.parameters['decode_task']) return decodeTask.get_codebook() + def _remove_zplane_duplicates(self, barcodes, zPlanes, maxDist): + barcodeGroups = barcodes.groupby('barcode_id') + bcToKeep = [] + for bcGroup, bcData in barcodeGroups: + bcToKeep.append(duplicatebcremoval.cleanup_across_z(bcData, zPlanes, maxDist)) + mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) + mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) + return mergedBC + def _run_analysis(self, fragmentIndex): decodeTask = self.dataSet.load_analysis_task( self.parameters['decode_task']) @@ -52,6 +68,10 @@ def _run_analysis(self, fragmentIndex): .get_filtered_barcodes(areaThreshold, intensityThreshold, distanceThreshold=distanceThreshold, fov=fragmentIndex) + if self.parameters['remove_zplane_duplicates']: + zPlanes = self.parameters['z_planes_above_below'] + maxDist = self.parameters['max_centroid_separation'] + currentBC = self._remove_zplane_duplicates(currentBC, zPlanes, maxDist) barcodeDB.write_barcodes(currentBC, fov=fragmentIndex) @@ -316,7 +336,7 @@ def extreme_values(inputData: pandas.Series): codingCounts, 'coding_counts', self) -class 
AdaptiveFilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): +class AdaptiveFilterBarcodes(FilterBarcodes): """ An analysis task that filters barcodes based on a mean intensity threshold @@ -330,9 +350,6 @@ def __init__(self, dataSet, parameters=None, analysisName=None): if 'misidentification_rate' not in self.parameters: self.parameters['misidentification_rate'] = 0.05 - def fragment_count(self): - return len(self.dataSet.get_fovs()) - def get_estimated_memory(self): return 1000 @@ -352,11 +369,6 @@ def get_adaptive_thresholds(self): return self.dataSet.load_analysis_task( self.parameters['adaptive_task']) - def get_codebook(self) -> Codebook: - decodeTask = self.dataSet.load_analysis_task( - self.parameters['decode_task']) - return decodeTask.get_codebook() - def _run_analysis(self, fragmentIndex): adaptiveTask = self.dataSet.load_analysis_task( self.parameters['adaptive_task']) @@ -369,6 +381,10 @@ def _run_analysis(self, fragmentIndex): bcDatabase = self.get_barcode_database() currentBarcodes = decodeTask.get_barcode_database()\ .get_barcodes(fragmentIndex) + if self.parameters['remove_zplane_duplicates']: + zPlanes = self.parameters['z_planes_above_below'] + maxDist = self.parameters['max_centroid_separation'] + currentBarcodes = self._remove_zplane_duplicates(currentBarcodes, zPlanes, maxDist) bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( threshold, currentBarcodes), fov=fragmentIndex) diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py new file mode 100755 index 00000000..77dcb121 --- /dev/null +++ b/merlin/util/duplicatebcremoval.py @@ -0,0 +1,41 @@ +import numpy as np +from scipy.spatial import cKDTree +import networkx as nx +import pandas as pd + +def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) -> pd.DataFrame: + """ Depending on the separation between z planes, spots from a single molecule may be observed in more than + one z plane. 
These putative duplicates are removed based on supplied distance and z plane constraints. + In evaluating this method, when z planes are separated by 1.5 µm the likelihood of finding a putative + duplicate above or below the selected plane is ~5-10%, whereas the false-positive rate is closer to 1%, + as determined by checking two planes above or below, or comparing barcodes of different identities but + similar abundance between adjacent z planes. + + Args: + barcodes: a pandas dataframe containing all the entries for a given barcode identity + zPlanes: number of planes above and below to consider when evaluating potential duplicates + maxDist: maximum euclidean distance allowed to separate centroids of putative barcode duplicate, in pixels + Returns: + keptBarcodes: pandas dataframe where barcodes of the same identity that fall within parameters of z plane + duplicates have been removed + """ + barcodes.reset_index(drop = True, inplace = True) + graph = nx.Graph() + zPos = sorted(barcodes['z'].unique()) + graph.add_nodes_from(barcodes.index.values.tolist()) + for i in range(0, len(zPos)): + z = zPos[i] + zToCompare = [otherZ for pos, otherZ in enumerate(zPos) if + (pos >= i - zPlanes) & (pos <= i + zPlanes) & ~(pos == i)] + treeBC = barcodes[barcodes['z'] == z] + tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) + for compZ in zToCompare: + queryBC = barcodes[barcodes['z'] == compZ] + dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, distance_upper_bound=maxDist) + currentHits = treeBC.index.values[idx[np.isfinite(dist)]] + comparisonHits = queryBC.index.values[np.isfinite(dist)] + graph.add_edges_from(list(zip(currentHits, comparisonHits))) + connectedComponents = [list(x) for x in list(nx.connected_components(graph))] + keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else random.choice(x) for x in connectedComponents]), :] + return keptBarcodes + From 703282cf98baa4f69197b0a10f86c39be770ae91 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn 
Date: Wed, 25 Mar 2020 12:10:05 -0400 Subject: [PATCH 20/66] fixing imports, adding to tests, updating docs --- docs/tasks.rst | 7 +++++++ merlin/analysis/filterbarcodes.py | 2 +- merlin/util/duplicatebcremoval.py | 3 ++- test/auxiliary_files/test_analysis_parameters.json | 10 ++++++++-- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/tasks.rst b/docs/tasks.rst index 44789416..6aa2e3bb 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -68,6 +68,10 @@ Parameters: * area\_threshold -- Barcodes with areas below the specified area\_threshold are removed. * intensity\_threshold -- Barcodes with intensities below this threshold are removed. +* remove\_zplane\_duplicates -- Remove putative duplicate barcode counts from adjacent z planes. +* z\_planes\_above\_below -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. +* max\_centroid\_separation -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. + filterbarcodes.GenerateAdaptiveThreshold ------------------------------------------- @@ -87,6 +91,9 @@ Description: Use an adaptive barcode to enrich the decode barcodes for the corre Parameters: * misidentification_rate -- The target misidentification rate, calculated as the number of blank barcodes per blank barcode divided by the number of coding barcodes per coding barcode. +* remove\_zplane\_duplicates -- Remove putative duplicate barcode counts from adjacent z planes. +* z\_planes\_above\_below -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. +* max\_centroid\_separation -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. 
segment.SegmentCells ---------------------- diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index dcb62303..beb2da89 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -53,7 +53,7 @@ def _remove_zplane_duplicates(self, barcodes, zPlanes, maxDist): bcToKeep = [] for bcGroup, bcData in barcodeGroups: bcToKeep.append(duplicatebcremoval.cleanup_across_z(bcData, zPlanes, maxDist)) - mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) + mergedBC = pandas.concat(bcToKeep, 0).reset_index(drop=True) mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) return mergedBC diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py index 77dcb121..1a8311a4 100755 --- a/merlin/util/duplicatebcremoval.py +++ b/merlin/util/duplicatebcremoval.py @@ -2,6 +2,7 @@ from scipy.spatial import cKDTree import networkx as nx import pandas as pd +from random import choice def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) -> pd.DataFrame: """ Depending on the separation between z planes, spots from a single molecule may be observed in more than @@ -36,6 +37,6 @@ def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) -> pd comparisonHits = queryBC.index.values[np.isfinite(dist)] graph.add_edges_from(list(zip(currentHits, comparisonHits))) connectedComponents = [list(x) for x in list(nx.connected_components(graph))] - keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else random.choice(x) for x in connectedComponents]), :] + keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x in connectedComponents]), :] return keptBarcodes diff --git a/test/auxiliary_files/test_analysis_parameters.json b/test/auxiliary_files/test_analysis_parameters.json index 4d27f2ec..dd9d553f 100755 --- a/test/auxiliary_files/test_analysis_parameters.json +++ b/test/auxiliary_files/test_analysis_parameters.json @@ -67,7 +67,10 @@ "parameters": { 
"decode_task": "Decode", "area_threshold": 5, - "intensity_threshold": 1 + "intensity_threshold": 1, + "remove_zplane_duplicates": true, + "z_planes_above_below": 1, + "max_centroid_separation": 1.414 } }, { @@ -83,7 +86,10 @@ "module": "merlin.analysis.filterbarcodes", "parameters": { "decode_task": "Decode", - "adaptive_task": "GenerateAdaptiveThreshold" + "adaptive_task": "GenerateAdaptiveThreshold", + "remove_zplane_duplicates": true, + "z_planes_above_below": 1, + "max_centroid_separation": 1.414 } }, { From d21f9a016619d794ce1155a138228b056a1f4c0c Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 25 Mar 2020 12:48:19 -0400 Subject: [PATCH 21/66] pep8 and changelog --- CHANGELOG.md | 2 ++ merlin/analysis/filterbarcodes.py | 9 +++++--- merlin/util/duplicatebcremoval.py | 38 +++++++++++++++++++------------ 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d6d90c3..fa07c40d 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,3 +34,5 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.1.5] - 2020-01-22 ### Changed - Updated the filemap to only store the file name so that it can easily be pointed to new data home directories. This change maintains backward compatibility. +### Added +- Parameters to filter tasks that enable removing barcodes that were putatively duplicated across adjacent z planes. 
\ No newline at end of file diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index beb2da89..2549123a 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -52,7 +52,8 @@ def _remove_zplane_duplicates(self, barcodes, zPlanes, maxDist): barcodeGroups = barcodes.groupby('barcode_id') bcToKeep = [] for bcGroup, bcData in barcodeGroups: - bcToKeep.append(duplicatebcremoval.cleanup_across_z(bcData, zPlanes, maxDist)) + bcToKeep.append(duplicatebcremoval.cleanup_across_z(bcData, zPlanes, + maxDist)) mergedBC = pandas.concat(bcToKeep, 0).reset_index(drop=True) mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) return mergedBC @@ -71,7 +72,8 @@ def _run_analysis(self, fragmentIndex): if self.parameters['remove_zplane_duplicates']: zPlanes = self.parameters['z_planes_above_below'] maxDist = self.parameters['max_centroid_separation'] - currentBC = self._remove_zplane_duplicates(currentBC, zPlanes, maxDist) + currentBC = self._remove_zplane_duplicates(currentBC, zPlanes, + maxDist) barcodeDB.write_barcodes(currentBC, fov=fragmentIndex) @@ -384,7 +386,8 @@ def _run_analysis(self, fragmentIndex): if self.parameters['remove_zplane_duplicates']: zPlanes = self.parameters['z_planes_above_below'] maxDist = self.parameters['max_centroid_separation'] - currentBarcodes = self._remove_zplane_duplicates(currentBarcodes, zPlanes, maxDist) + currentBarcodes = self._remove_zplane_duplicates(currentBarcodes, + zPlanes, maxDist) bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( threshold, currentBarcodes), fov=fragmentIndex) diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py index 1a8311a4..008637c4 100755 --- a/merlin/util/duplicatebcremoval.py +++ b/merlin/util/duplicatebcremoval.py @@ -4,21 +4,29 @@ import pandas as pd from random import choice -def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) -> pd.DataFrame: - """ Depending on the 
separation between z planes, spots from a single molecule may be observed in more than - one z plane. These putative duplicates are removed based on supplied distance and z plane constraints. - In evaluating this method, when z planes are separated by 1.5 µm the likelihood of finding a putative - duplicate above or below the selected plane is ~5-10%, whereas the false-positive rate is closer to 1%, - as determined by checking two planes above or below, or comparing barcodes of different identities but - similar abundance between adjacent z planes. +def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ + -> pd.DataFrame: + """ Depending on the separation between z planes, spots from a single + molecule may be observed in more than one z plane. These putative + duplicates are removed based on supplied distance and z plane + constraints. In evaluating this method, when z planes are separated + by 1.5 µm the likelihood of finding a putative duplicate above or below + the selected plane is ~5-10%, whereas the false-positive rate is closer + to 1%, as determined by checking two planes above or below, or comparing + barcodes of different identities but similar abundance between + adjacent z planes. 
Args: - barcodes: a pandas dataframe containing all the entries for a given barcode identity - zPlanes: number of planes above and below to consider when evaluating potential duplicates - maxDist: maximum euclidean distance allowed to separate centroids of putative barcode duplicate, in pixels + barcodes: a pandas dataframe containing all the entries for a given + barcode identity + zPlanes: number of planes above and below to consider when evaluating + potential duplicates + maxDist: maximum euclidean distance allowed to separate centroids of + putative barcode duplicate, in pixels Returns: - keptBarcodes: pandas dataframe where barcodes of the same identity that fall within parameters of z plane - duplicates have been removed + keptBarcodes: pandas dataframe where barcodes of the same identity that + fall within parameters of z plane duplicates have + been removed. """ barcodes.reset_index(drop = True, inplace = True) graph = nx.Graph() @@ -32,11 +40,13 @@ def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) -> pd tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) for compZ in zToCompare: queryBC = barcodes[barcodes['z'] == compZ] - dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, distance_upper_bound=maxDist) + dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, + distance_upper_bound=maxDist) currentHits = treeBC.index.values[idx[np.isfinite(dist)]] comparisonHits = queryBC.index.values[np.isfinite(dist)] graph.add_edges_from(list(zip(currentHits, comparisonHits))) connectedComponents = [list(x) for x in list(nx.connected_components(graph))] - keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x in connectedComponents]), :] + keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x + in connectedComponents]), :] return keptBarcodes From 5ff99cfc027a2e60afd404c787f054558d2af669 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 25 Mar 2020 12:48:55 -0400 Subject: 
[PATCH 22/66] pep8 --- merlin/analysis/filterbarcodes.py | 1 + merlin/util/duplicatebcremoval.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index 2549123a..e9086ad1 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -391,3 +391,4 @@ def _run_analysis(self, fragmentIndex): bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( threshold, currentBarcodes), fov=fragmentIndex) + \ No newline at end of file diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py index 008637c4..0f75f6e8 100755 --- a/merlin/util/duplicatebcremoval.py +++ b/merlin/util/duplicatebcremoval.py @@ -49,4 +49,3 @@ def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x in connectedComponents]), :] return keptBarcodes - From c24585c710804fc769d915aef790aacd50916359 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 25 Mar 2020 12:51:13 -0400 Subject: [PATCH 23/66] pep8 --- merlin/analysis/filterbarcodes.py | 3 +-- merlin/util/duplicatebcremoval.py | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index e9086ad1..57b4939d 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -390,5 +390,4 @@ def _run_analysis(self, fragmentIndex): zPlanes, maxDist) bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( - threshold, currentBarcodes), fov=fragmentIndex) - \ No newline at end of file + threshold, currentBarcodes), fov=fragmentIndex) \ No newline at end of file diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py index 0f75f6e8..a53971e8 100755 --- a/merlin/util/duplicatebcremoval.py +++ b/merlin/util/duplicatebcremoval.py @@ -4,6 +4,7 @@ import 
pandas as pd from random import choice + def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ -> pd.DataFrame: """ Depending on the separation between z planes, spots from a single @@ -28,7 +29,7 @@ def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ fall within parameters of z plane duplicates have been removed. """ - barcodes.reset_index(drop = True, inplace = True) + barcodes.reset_index(drop=True, inplace=True) graph = nx.Graph() zPos = sorted(barcodes['z'].unique()) graph.add_nodes_from(barcodes.index.values.tolist()) @@ -45,7 +46,8 @@ def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ currentHits = treeBC.index.values[idx[np.isfinite(dist)]] comparisonHits = queryBC.index.values[np.isfinite(dist)] graph.add_edges_from(list(zip(currentHits, comparisonHits))) - connectedComponents = [list(x) for x in list(nx.connected_components(graph))] + connectedComponents = [list(x) for x in + list(nx.connected_components(graph))] keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x in connectedComponents]), :] return keptBarcodes From 01fbb7b20bad9df90c96dd895b1172428fca77fb Mon Sep 17 00:00:00 2001 From: seichhorn Date: Wed, 25 Mar 2020 12:53:36 -0400 Subject: [PATCH 24/66] pep8 --- merlin/analysis/filterbarcodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index 57b4939d..2549123a 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -390,4 +390,4 @@ def _run_analysis(self, fragmentIndex): zPlanes, maxDist) bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( - threshold, currentBarcodes), fov=fragmentIndex) \ No newline at end of file + threshold, currentBarcodes), fov=fragmentIndex) From 43c7f7394373d6b321b90b7877ed03f52eab69bc Mon Sep 17 00:00:00 2001 From: seichhorn Date: Sat, 28 Mar 2020 17:06:28 -0400 Subject: [PATCH 
25/66] Improved decoding speed --- CHANGELOG.md | 1 + merlin/analysis/decode.py | 2 +- merlin/analysis/globalalign.py | 30 +++++- merlin/analysis/optimize.py | 7 +- merlin/util/decoding.py | 164 +++++++++++++++++---------------- 5 files changed, 118 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d6d90c3..e0c1c964 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,3 +34,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.1.5] - 2020-01-22 ### Changed - Updated the filemap to only store the file name so that it can easily be pointed to new data home directories. This change maintains backward compatibility. +- Improved decoding speed diff --git a/merlin/analysis/decode.py b/merlin/analysis/decode.py index 51d3693a..6cd1b1ce 100755 --- a/merlin/analysis/decode.py +++ b/merlin/analysis/decode.py @@ -209,6 +209,6 @@ def _extract_and_save_barcodes( self.get_barcode_database().write_barcodes( pandas.concat([decoder.extract_barcodes_with_index( i, decodedImage, pixelMagnitudes, pixelTraces, distances, fov, - self.cropWidth, zIndex, globalTask, None, minimumArea) + self.cropWidth, zIndex, globalTask, minimumArea) for i in range(self.get_codebook().get_barcode_count())]), fov=fov) diff --git a/merlin/analysis/globalalign.py b/merlin/analysis/globalalign.py index 405a7d40..68251a97 100755 --- a/merlin/analysis/globalalign.py +++ b/merlin/analysis/globalalign.py @@ -76,6 +76,19 @@ def get_global_extent(self) -> Tuple[float, float, float, float]: """ pass + @abstractmethod + def fov_coordinate_array_to_global(self, fov: int, + fovCoordArray: np.array) -> np.array: + """A bulk transformation of a list of fov coordinates to + global coordinates. 
+ Args: + fov: the fov of interest + fovCoordArray: numpy array of the [z, x, y] positions to transform + Returns: + numpy array of the global [z, x, y] coordinates + """ + pass + def get_fov_boxes(self) -> List: """ Creates a list of shapely boxes for each fov containing the global @@ -90,7 +103,6 @@ def get_fov_boxes(self) -> List: return boxes - class SimpleGlobalAlignment(GlobalAlignment): """A global alignment that uses the theoretical stage positions in @@ -126,10 +138,18 @@ def fov_coordinates_to_global(self, fov, fovCoordinates): fovStart[0] + fovCoordinates[1]*micronsPerPixel, fovStart[1] + fovCoordinates[2]*micronsPerPixel) - def fov_global_extent(self, fov: int) -> List[float]: + def fov_coordinate_array_to_global(self, fov: int, + fovCoordArray: np.array) -> np.array: + tForm = self.fov_to_global_transform(fov) + toGlobal = np.ones(fovCoordArray.shape) + toGlobal[:, [0, 1]] = fovCoordArray[:, [1, 2]] + globalCentroids = np.matmul(tForm, toGlobal.T).T[:, [2, 0, 1]] + globalCentroids[:, 0] = fovCoordArray[:, 0] + return globalCentroids + def fov_global_extent(self, fov: int) -> List[float]: """ - Returns the global extent of an fov, output interleaved as + Returns the global extent of a fov, output interleaved as xmin, ymin, xmax, ymax Args: @@ -204,6 +224,10 @@ def fov_to_global_transform(self, fov): def get_global_extent(self): raise NotImplementedError + def fov_coordinate_array_to_global(self, fov: int, + fovCoordArray: np.array) -> np.array: + raise NotImplementedError + @staticmethod def _calculate_overlap_area(x1, y1, x2, y2, width, height): """Calculates the overlapping area between two rectangles with diff --git a/merlin/analysis/optimize.py b/merlin/analysis/optimize.py index e4cdafe2..81787fa7 100755 --- a/merlin/analysis/optimize.py +++ b/merlin/analysis/optimize.py @@ -30,6 +30,8 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['optimize_background'] = False if 'optimize_chromatic_correction' not in 
self.parameters: self.parameters['optimize_chromatic_correction'] = False + if 'crop_width' not in self.parameters: + self.parameters['crop_width'] = 0 def get_estimated_memory(self): return 4000 @@ -97,10 +99,11 @@ def _run_analysis(self, fragmentIndex): # TODO this saves the barcodes under fragment instead of fov # the barcodedb should be made more general + cropWidth = self.parameters['crop_width'] self.get_barcode_database().write_barcodes( pandas.concat([decoder.extract_barcodes_with_index( - i, di, pm, npt, d, fovIndex, - 10, zIndex, minimumArea=areaThreshold) + i, di, pm, npt, d, fovIndex, cropWidth, + zIndex, minimumArea=areaThreshold) for i in range(codebook.get_barcode_count())]), fov=fragmentIndex) self.dataSet.save_numpy_analysis_result( diff --git a/merlin/util/decoding.py b/merlin/util/decoding.py index 693de247..2447917f 100644 --- a/merlin/util/decoding.py +++ b/merlin/util/decoding.py @@ -130,13 +130,11 @@ def decode_pixels(self, imageData: np.ndarray, return decodedImage, pixelMagnitudes, normalizedPixelTraces, distances - # TODO barcodes here has two different meanings. One of these should be - # renamed. def extract_barcodes_with_index( self, barcodeIndex: int, decodedImage: np.ndarray, pixelMagnitudes: np.ndarray, pixelTraces: np.ndarray, - distances: np.ndarray, fov: int, cropWidth: int, zIndex: int=None, - globalAligner=None, segmenter=None, minimumArea: int=0 + distances: np.ndarray, fov: int, cropWidth: int, zIndex: int = None, + globalAligner=None, minimumArea: int = 0 ) -> pandas.DataFrame: """Extract the barcode information from the decoded image for barcodes that were decoded to the specified barcode index. @@ -158,8 +156,6 @@ def extract_barcodes_with_index( zIndex: the index of the z position globalAligner: the aligner used for converted to local x,y coordinates to global x,y coordinates - segmenter: the cell segmenter for assigning a cell for each of the - identified barcodes minimumArea: the minimum area of barcodes to identify. 
Barcodes less than the specified minimum area are ignored. Returns: @@ -167,85 +163,93 @@ def extract_barcodes_with_index( specified barcode index """ properties = measure.regionprops( - measure.label(decodedImage == barcodeIndex), - intensity_image=pixelMagnitudes, - cache=False) - dList = [self._bc_properties_to_dict( - p, barcodeIndex, fov, distances, pixelTraces, zIndex, - globalAligner, segmenter - ) for p in properties - if self._position_within_crop( - p.centroid, cropWidth, decodedImage.shape) - and p.area >= minimumArea] - barcodeInformation = pandas.DataFrame(dList) - - return barcodeInformation - - @staticmethod - def _position_within_crop(position: np.ndarray, cropWidth: float, - imageSize: Tuple[int]) -> bool: - if len(position) == 2: - return cropWidth < position[0] < imageSize[0] - cropWidth \ - and cropWidth < position[1] < imageSize[1] - cropWidth + measure.label(decodedImage == barcodeIndex), + intensity_image=pixelMagnitudes, + cache=False) + is3D = len(pixelTraces.shape) == 4 + + columnNames = ['barcode_id', 'fov', 'mean_intensity', 'max_intensity', + 'area', 'mean_distance', 'min_distance', 'x', 'y', 'z', + 'global_x', 'global_y', 'global_z', 'cell_index'] + if is3D: + intensityColumns = ['intensity_{}'.format(i) for i in + range(pixelTraces.shape[1])] else: - return cropWidth < position[1] < imageSize[1] - cropWidth \ - and cropWidth < position[2] < imageSize[2] - cropWidth - - def _bc_properties_to_dict(self, properties, bcIndex: int, fov: int, - distances: np.ndarray, pixelTraces: np.ndarray, - zIndex: int=None, globalAligner=None, - segmenter=None) -> Dict: - # centroid is reversed since skimage regionprops returns the centroid - # as (r,c) - inputCentroid = properties.weighted_centroid - if len(inputCentroid) == 2: - centroid = [zIndex, inputCentroid[1], inputCentroid[0]] - else: - centroid = [inputCentroid[0], inputCentroid[2], inputCentroid[1]] + intensityColumns = ['intensity_{}'.format(i) for i in + range(pixelTraces.shape[0])] + if 
len(properties) == 0: + return pandas.DataFrame(columns=columnNames + intensityColumns) + + allCoords = [list(p.coords) for p in properties] + + if is3D: + centroidCoords = np.array( + [prop.weighted_centroid for prop in properties]) + centroids = centroidCoords[:, [0, 2, 1]] + d = [[distances[y[0], y[1], y[2]] for y in x] for x in allCoords] + intensityAndAreas = np.array([[x.mean_intensity, + x.max_intensity, + x.area] for x in properties]) + intensities = [ + [pixelTraces[y[0], :, y[1], y[2]] for y in x] for x in + allCoords] + intensities = pandas.DataFrame( + [np.mean(x, 0) if len(x) > 1 else x[0] for x in intensities], + columns=intensityColumns) - if globalAligner is not None: - globalCentroid = globalAligner.fov_coordinates_to_global( - fov, centroid) else: - globalCentroid = centroid + intensityAndCoords = [ + np.array([[y[0], y[1], pixelMagnitudes[y[0], y[1]]] for y in x]) + for x in allCoords] + centroidCoords = np.array( + [[(r[:, 0] * (r[:, -1] / r[:, -1].sum())).sum(), + (r[:, 1] * (r[:, -1] / r[:, -1].sum())).sum()] + if r.shape[0] > 1 else [r[0][0], r[0][1]] + for r in intensityAndCoords]) + centroids = np.zeros((centroidCoords.shape[0], 3)) + centroids[:, 0] = zIndex + centroids[:, [1, 2]] = centroidCoords[:, [1, 0]] + d = [[distances[y[0], y[1]] for y in x] for x in allCoords] + intensityAndAreas = np.array([[x[:, 2].mean(), + x[:, 2].max(), + x.shape[0]] + for x in intensityAndCoords]) + intensities = [[pixelTraces[:, y[0], y[1]] for y in x] for + x in allCoords] + intensities = pandas.DataFrame( + [np.mean(x, 0) if len(x) > 1 else x[0] for x in intensities], + columns=intensityColumns) - if len(distances.shape) == 2: - d = [distances[x[0], x[1]] for x in properties.coords] - else: - d = [distances[x[0], x[1], x[2]] for x in properties.coords] - - outputDict = {'barcode_id': bcIndex, - 'fov': fov, - 'mean_intensity': properties.mean_intensity, - 'max_intensity': properties.max_intensity, - 'area': properties.area, - 'mean_distance': np.mean(d), - 
'min_distance': np.min(d), - 'x': centroid[1], - 'y': centroid[2], - 'z': centroid[0], - 'global_x': globalCentroid[1], - 'global_y': globalCentroid[2], - 'global_z': globalCentroid[0], - 'cell_index': -1} - - if len(pixelTraces.shape) == 3: - for i in range(len(pixelTraces)): - outputDict['intensity_' + str(i)] = \ - np.mean([pixelTraces[i, x[0], x[1]] - for x in properties.coords]) + if globalAligner is not None: + globalCentroids = globalAligner.fov_coordinate_array_to_global( + fov, centroids) else: - for i in range(len(pixelTraces[0])): - outputDict['intensity_' + str(i)] = \ - np.mean([pixelTraces[x[0], i, x[1], x[2]] - for x in properties.coords]) - - if segmenter is not None: - outputDict['cell_index'] = segmenter \ - .get_cell_containing_position( - globalCentroid[0], globalCentroid[1]) - - return outputDict + globalCentroids = centroids + + df = pandas.DataFrame(np.zeros((len(properties), len(columnNames))), + columns=columnNames) + df['barcode_id'] = barcodeIndex + df['fov'] = fov + df.loc[:, ['mean_intensity', 'max_intensity', 'area']] = \ + intensityAndAreas + df.loc[:, ['mean_distance', 'min_distance']] = np.array( + [[np.mean(x), np.min(x)] if len(x) > 1 else [x[0], x[0]] for x in + d]) + df.loc[:, ['x', 'y', 'z']] = centroids[:, [1, 2, 0]] + df.loc[:, ['global_x', 'global_y', 'global_z']] = \ + globalCentroids[:, [1, 2, 0]] + df['cell_index'] = -1 + + fullDF = pandas.concat([df, intensities], 1) + fullDF = fullDF[(fullDF['x'].between(cropWidth, + decodedImage.shape[0] - cropWidth, + inclusive=False)) & + (fullDF['y'].between(cropWidth, + decodedImage.shape[1] - cropWidth, + inclusive=False)) & + (fullDF['area'] >= minimumArea)] + + return fullDF def _calculate_normalized_barcodes( self, ignoreBlanks=False, includeErrors=False): From a120ba3751217113ee3b76e784957517da16d83f Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 11:11:27 -0400 Subject: [PATCH 26/66] restructuring to address comments --- 
merlin/analysis/filterbarcodes.py | 61 ++++++++++---------- merlin/util/barcodefilters.py | 96 +++++++++++++++++++++++++++++++ merlin/util/duplicatebcremoval.py | 53 ----------------- 3 files changed, 126 insertions(+), 84 deletions(-) create mode 100755 merlin/util/barcodefilters.py delete mode 100755 merlin/util/duplicatebcremoval.py diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index 2549123a..d1c93415 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -3,11 +3,35 @@ from scipy import optimize from merlin.core import analysistask -from merlin.data.codebook import Codebook from merlin.analysis import decode -from merlin.util import duplicatebcremoval +from merlin.util import barcodefilters -class FilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): + +class AbstractFilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): + """ + An abstract class for filtering barcodes identified by pixel-based decoding. 
+ """ + + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) + + if 'remove_z_duplicated_barcodes' not in self.parameters: + self.parameters['remove_z_duplicated_barcodes'] = False + if self.parameters['remove_z_duplicated_barcodes']: + if 'z_duplicate_zPlane_threshold' not in self.parameters: + self.parameters['z_duplicate_zPlane_threshold'] = 1 + if 'z_duplicate_xy_pixel_threshold' not in self.parameters: + self.parameters['z_duplicate_xy_pixel_threshold'] = np.sqrt(2) + + def remove_z_duplicate_barcodes(self, bc): + if self.parameters['remove_z_duplicated_barcodes']: + bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bc, self.parameters['z_duplicate_zPlane_threshold'], + self.parameters['z_duplicate_xy_pixel_threshold']) + return bc + + +class FilterBarcodes(AbstractFilterBarcodes): """ An analysis task that filters barcodes based on area and mean @@ -23,13 +47,6 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['intensity_threshold'] = 200 if 'distance_threshold' not in self.parameters: self.parameters['distance_threshold'] = 1e6 - if 'remove_zplane_duplicates' not in self.parameters: - self.parameters['remove_zplane_duplicates'] = False - if self.parameters['remove_zplane_duplicates']: - if 'z_planes_above_below' not in self.parameters: - self.parameters['z_planes_above_below'] = 1 - if 'max_centroid_separation' not in self.parameters: - self.parameters['max_centroid_separation'] = np.sqrt(2) def fragment_count(self): return len(self.dataSet.get_fovs()) @@ -48,16 +65,6 @@ def get_codebook(self): self.parameters['decode_task']) return decodeTask.get_codebook() - def _remove_zplane_duplicates(self, barcodes, zPlanes, maxDist): - barcodeGroups = barcodes.groupby('barcode_id') - bcToKeep = [] - for bcGroup, bcData in barcodeGroups: - bcToKeep.append(duplicatebcremoval.cleanup_across_z(bcData, zPlanes, - maxDist)) - mergedBC = pandas.concat(bcToKeep, 
0).reset_index(drop=True) - mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) - return mergedBC - def _run_analysis(self, fragmentIndex): decodeTask = self.dataSet.load_analysis_task( self.parameters['decode_task']) @@ -69,11 +76,7 @@ def _run_analysis(self, fragmentIndex): .get_filtered_barcodes(areaThreshold, intensityThreshold, distanceThreshold=distanceThreshold, fov=fragmentIndex) - if self.parameters['remove_zplane_duplicates']: - zPlanes = self.parameters['z_planes_above_below'] - maxDist = self.parameters['max_centroid_separation'] - currentBC = self._remove_zplane_duplicates(currentBC, zPlanes, - maxDist) + currentBC = self.remove_z_duplicate_barcodes(currentBC) barcodeDB.write_barcodes(currentBC, fov=fragmentIndex) @@ -338,7 +341,7 @@ def extreme_values(inputData: pandas.Series): codingCounts, 'coding_counts', self) -class AdaptiveFilterBarcodes(FilterBarcodes): +class AdaptiveFilterBarcodes(AbstractFilterBarcodes): """ An analysis task that filters barcodes based on a mean intensity threshold @@ -383,11 +386,7 @@ def _run_analysis(self, fragmentIndex): bcDatabase = self.get_barcode_database() currentBarcodes = decodeTask.get_barcode_database()\ .get_barcodes(fragmentIndex) - if self.parameters['remove_zplane_duplicates']: - zPlanes = self.parameters['z_planes_above_below'] - maxDist = self.parameters['max_centroid_separation'] - currentBarcodes = self._remove_zplane_duplicates(currentBarcodes, - zPlanes, maxDist) + currentBarcodes = self.remove_z_duplicate_barcodes(currentBarcodes) bcDatabase.write_barcodes( adaptiveTask.extract_barcodes_with_threshold( threshold, currentBarcodes), fov=fragmentIndex) diff --git a/merlin/util/barcodefilters.py b/merlin/util/barcodefilters.py new file mode 100755 index 00000000..0d6b5615 --- /dev/null +++ b/merlin/util/barcodefilters.py @@ -0,0 +1,96 @@ +import numpy as np +from scipy.spatial import cKDTree +import networkx as nx +import pandas as pd + + +def remove_zplane_duplicates_all_barcodeids(barcodes: 
pd.DataFrame, + zPlanes: int, + maxDist: float) -> pd.DataFrame: + """ Depending on the separation between z planes, spots from a single + molecule may be observed in more than one z plane. These putative + duplicates are removed based on supplied distance and z plane + constraints. In evaluating this method, when z planes are separated + by 1.5 µm the likelihood of finding a putative duplicate above or below + the selected plane is ~5-10%, whereas the false-positive rate is closer + to 1%, as determined by checking two planes above or below, or comparing + barcodes of different identities but similar abundance between + adjacent z planes. + + Args: + barcodes: a pandas dataframe containing all the entries for a given + barcode identity + zPlanes: number of planes above and below to consider when evaluating + potential duplicates + maxDist: maximum euclidean distance allowed to separate centroids of + putative barcode duplicate, in pixels + Returns: + keptBarcodes: pandas dataframe where barcodes of the same identity that + fall within parameters of z plane duplicates have + been removed. + """ + barcodeGroups = barcodes.groupby('barcode_id') + bcToKeep = [] + for bcGroup, bcData in barcodeGroups: + bcToKeep.append( + barcodefilters.remove_zplane_duplicates_single_barcodeid(bcData, + zPlanes, + maxDist)) + mergedBC = pandas.concat(bcToKeep, 0).reset_index(drop=True) + mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) + return mergedBC + +def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, + zPlanes: int, + maxDist: float) -> pd.DataFrame: + """ Remove barcodes with a given barcode id that are putative z plane + duplicates. 
+ + Args: + barcodes: a pandas dataframe containing all the entries for a given + barcode identity + zPlanes: number of planes above and below to consider when evaluating + potential duplicates + maxDist: maximum euclidean distance allowed to separate centroids of + putative barcode duplicate, in pixels + Returns: + keptBarcodes: pandas dataframe where barcodes of the same identity that + fall within parameters of z plane duplicates have + been removed. + """ + barcodes.reset_index(drop=True, inplace=True) + if not len(barcodes['barcode_id'].unique()) == 1: + errorString = 'The method remove_zplane_duplicates_single_barcodeid ' +\ + 'should be given a dataframe containing molecules ' +\ + 'that all have the same barcode id. Please use ' +\ + 'remove_zplane_duplicates_all_barcodeids to handle ' +\ + 'dataframes containing multiple barcode ids' + raise ValueError(errorString) + graph = nx.Graph() + zPos = sorted(barcodes['z'].unique()) + graph.add_nodes_from(barcodes.index.values.tolist()) + for i in range(0, len(zPos)): + z = zPos[i] + zToCompare = [otherZ for pos, otherZ in enumerate(zPos) if + (pos >= i - zPlanes) & (pos <= i + zPlanes) & ~(pos == i)] + treeBC = barcodes[barcodes['z'] == z] + tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) + for compZ in zToCompare: + queryBC = barcodes[barcodes['z'] == compZ] + dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, + distance_upper_bound=maxDist) + currentHits = treeBC.index.values[idx[np.isfinite(dist)]] + comparisonHits = queryBC.index.values[np.isfinite(dist)] + graph.add_edges_from(list(zip(currentHits, comparisonHits))) + connectedComponents = [list(x) for x in + list(nx.connected_components(graph))] + + def choose_brighter_barcode(barcodes, indexes): + sortedBC = barcodes.loc[indexes, :].sort_values(by='mean_intensity', + ascending=False) + return sortedBC.index.values.tolist()[0] + + keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else + choose_brighter_barcode(barcodes, x) + for x in 
connectedComponents]), :] + return keptBarcodes diff --git a/merlin/util/duplicatebcremoval.py b/merlin/util/duplicatebcremoval.py deleted file mode 100755 index a53971e8..00000000 --- a/merlin/util/duplicatebcremoval.py +++ /dev/null @@ -1,53 +0,0 @@ -import numpy as np -from scipy.spatial import cKDTree -import networkx as nx -import pandas as pd -from random import choice - - -def cleanup_across_z(barcodes: pd.DataFrame, zPlanes: int, maxDist: float) \ - -> pd.DataFrame: - """ Depending on the separation between z planes, spots from a single - molecule may be observed in more than one z plane. These putative - duplicates are removed based on supplied distance and z plane - constraints. In evaluating this method, when z planes are separated - by 1.5 µm the likelihood of finding a putative duplicate above or below - the selected plane is ~5-10%, whereas the false-positive rate is closer - to 1%, as determined by checking two planes above or below, or comparing - barcodes of different identities but similar abundance between - adjacent z planes. - - Args: - barcodes: a pandas dataframe containing all the entries for a given - barcode identity - zPlanes: number of planes above and below to consider when evaluating - potential duplicates - maxDist: maximum euclidean distance allowed to separate centroids of - putative barcode duplicate, in pixels - Returns: - keptBarcodes: pandas dataframe where barcodes of the same identity that - fall within parameters of z plane duplicates have - been removed. 
- """ - barcodes.reset_index(drop=True, inplace=True) - graph = nx.Graph() - zPos = sorted(barcodes['z'].unique()) - graph.add_nodes_from(barcodes.index.values.tolist()) - for i in range(0, len(zPos)): - z = zPos[i] - zToCompare = [otherZ for pos, otherZ in enumerate(zPos) if - (pos >= i - zPlanes) & (pos <= i + zPlanes) & ~(pos == i)] - treeBC = barcodes[barcodes['z'] == z] - tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) - for compZ in zToCompare: - queryBC = barcodes[barcodes['z'] == compZ] - dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, - distance_upper_bound=maxDist) - currentHits = treeBC.index.values[idx[np.isfinite(dist)]] - comparisonHits = queryBC.index.values[np.isfinite(dist)] - graph.add_edges_from(list(zip(currentHits, comparisonHits))) - connectedComponents = [list(x) for x in - list(nx.connected_components(graph))] - keptBarcodes = barcodes.loc[sorted([x[0] if len(x) == 1 else choice(x) for x - in connectedComponents]), :] - return keptBarcodes From 47893d8da7d087ecfe894948d963ee059a7f09fd Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 13:28:01 -0400 Subject: [PATCH 27/66] updating for unit test --- merlin/analysis/filterbarcodes.py | 6 +++++- merlin/util/barcodefilters.py | 14 +++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index d1c93415..2d671dea 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -27,7 +27,8 @@ def remove_z_duplicate_barcodes(self, bc): if self.parameters['remove_z_duplicated_barcodes']: bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( bc, self.parameters['z_duplicate_zPlane_threshold'], - self.parameters['z_duplicate_xy_pixel_threshold']) + self.parameters['z_duplicate_xy_pixel_threshold'], + self.dataSet.get_z_positions()) return bc @@ -355,6 +356,9 @@ def __init__(self, dataSet, parameters=None, analysisName=None): if 
'misidentification_rate' not in self.parameters: self.parameters['misidentification_rate'] = 0.05 + def fragment_count(self): + return len(self.dataSet.get_fovs()) + def get_estimated_memory(self): return 1000 diff --git a/merlin/util/barcodefilters.py b/merlin/util/barcodefilters.py index 0d6b5615..45c751c7 100755 --- a/merlin/util/barcodefilters.py +++ b/merlin/util/barcodefilters.py @@ -2,11 +2,13 @@ from scipy.spatial import cKDTree import networkx as nx import pandas as pd +from typing import List def remove_zplane_duplicates_all_barcodeids(barcodes: pd.DataFrame, zPlanes: int, - maxDist: float) -> pd.DataFrame: + maxDist: float, + allZPos: List) -> pd.DataFrame: """ Depending on the separation between z planes, spots from a single molecule may be observed in more than one z plane. These putative duplicates are removed based on supplied distance and z plane @@ -35,14 +37,16 @@ def remove_zplane_duplicates_all_barcodeids(barcodes: pd.DataFrame, bcToKeep.append( barcodefilters.remove_zplane_duplicates_single_barcodeid(bcData, zPlanes, - maxDist)) - mergedBC = pandas.concat(bcToKeep, 0).reset_index(drop=True) + maxDist, + allZPos)) + mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) return mergedBC def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, zPlanes: int, - maxDist: float) -> pd.DataFrame: + maxDist: float, + allZPos: List) -> pd.DataFrame: """ Remove barcodes with a given barcode id that are putative z plane duplicates. 
@@ -67,7 +71,7 @@ def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, 'dataframes containing multiple barcode ids' raise ValueError(errorString) graph = nx.Graph() - zPos = sorted(barcodes['z'].unique()) + zPos = sorted(allZPos) graph.add_nodes_from(barcodes.index.values.tolist()) for i in range(0, len(zPos)): z = zPos[i] From d0f652346f05bc9e3e94bc415974faa96777b2d2 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 13:47:10 -0400 Subject: [PATCH 28/66] updating for unit test --- merlin/analysis/filterbarcodes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index 2d671dea..744f52f7 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -369,6 +369,11 @@ def get_dependencies(self): return [self.parameters['adaptive_task'], self.parameters['decode_task']] + def get_codebook(self): + decodeTask = self.dataSet.load_analysis_task( + self.parameters['decode_task']) + return decodeTask.get_codebook() + def get_adaptive_thresholds(self): """ Get the adaptive thresholds used for filtering barcodes. 
From e37fb10f31d36386db801f96e3cf9f600deb0d40 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 14:09:56 -0400 Subject: [PATCH 29/66] adding test --- merlin/util/barcodefilters.py | 1 + test/test_zplane_duplicate_removal.py | 109 ++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 test/test_zplane_duplicate_removal.py diff --git a/merlin/util/barcodefilters.py b/merlin/util/barcodefilters.py index 45c751c7..e55fc946 100755 --- a/merlin/util/barcodefilters.py +++ b/merlin/util/barcodefilters.py @@ -43,6 +43,7 @@ def remove_zplane_duplicates_all_barcodeids(barcodes: pd.DataFrame, mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) return mergedBC + def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, zPlanes: int, maxDist: float, diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py new file mode 100644 index 00000000..4845666b --- /dev/null +++ b/test/test_zplane_duplicate_removal.py @@ -0,0 +1,109 @@ +import pytest +import pandas as pd +import json +import networkx as nx +import random +import numpy as np +from merlin.util import barcodefilters + +def generate_barcode(fov, barcode_id, x, y, z, mean_intensity): + bc = {'barcode': random.getrandbits(32), + 'barcode_id': barcode_id, + 'fov': fov, + 'mean_intensity': mean_intensity, + 'max_intensity': random.uniform(5, 15), + 'area': random.randint(0, 10), + 'mean_distance': random.random(), + 'min_distance': random.random(), + 'x': x, + 'y': y, + 'z': z, + 'global_x': random.uniform(0, 200000), + 'global_y': random.uniform(0, 200000), + 'global_z': random.uniform(0, 5), + 'cell_index': random.randint(0, 5000)} + + for i in range(16): + bc['intensity_' + str(i)] = random.uniform(5, 15) + + return bc + +b1 = generate_barcode(100, 5, 402.21, 787.11, 3, 14.23) +b2 = generate_barcode(100, 5, 502.21, 687.11, 4.5, 12.23) #same id, different place +b3 = generate_barcode(100, 17, 402.21, 787.11, 3, 10.23) #different id, 
same place + +b1_above_dimmer = generate_barcode(100, 5, 402.21, 787.11, 4.5, 11.23) +b1_closeby_above_brighter = generate_barcode(100, 5, 403.21, 787.11, 4.5, 15.23) +b2_above_brighter = generate_barcode(100, 5, 502.31, 687.11, 6, 14.23) +b1_closeby_below_brighter = generate_barcode(100, 5, 403.21, 787.11, 1.5, 15.0) +b1_closeby_toofar_brighter = generate_barcode(100, 5, 403.21, 787.11, 0, 15.0) + + +def test_multiple_comparisons_barcodes(): + zplane_cutoff = 1 + xy_cutoff = np.sqrt(2) + zpositions = [0, 1.5, 3, 4.5, 6, 7.5, 9] + + bcSet = [b1, b2, b3, b1_above_dimmer, b1_closeby_above_brighter, + b2_above_brighter, b1_closeby_below_brighter, + b1_closeby_toofar_brighter] + bcDF = pd.DataFrame(bcSet) + expected = [x['barcode'] for x in + [b1_closeby_above_brighter, b2_above_brighter, b3]] + notExpected = [x['barcode'] for x in [b1, b2, b1_above_dimmer, + b1_closeby_below_brighter, + b1_closeby_toofar_brighter]] + + keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, + xy_cutoff, zpositions) + for ex in expected: + assert ex in keptBC['barcode'].values + for notEx in notExpected: + assert notEx not in keptBC['barcode'].values + +def test_all_compatible_barcodes(): + zplane_cutoff = 1 + xy_cutoff = np.sqrt(2) + zpositions = [0, 1.5, 3, 4.5, 6, 7.5, 9] + + bcSet = [b1, b2, b3, b1_closeby_toofar_brighter] + bcDF = pd.DataFrame(bcSet) + expected = [x['barcode'] for x in bcSet] + keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, + xy_cutoff, zpositions) + for ex in expected: + assert ex in keptBC['barcode'].values + assert len(keptBC) == len(bcSet) + +def test_farther_zrange(): + zplane_cutoff = 2 + xy_cutoff = np.sqrt(2) + zpositions = [0, 1.5, 3, 4.5, 6, 7.5, 9] + + bcSet = [b1, b2, b3, b1_closeby_toofar_brighter] + bcDF = pd.DataFrame(bcSet) + expected = [x['barcode'] for x in [b2, b3, b1_closeby_toofar_brighter]] + notExpected = [x['barcode'] for x in [b1]] + keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, 
+ xy_cutoff, zpositions) + for ex in expected: + assert ex in keptBC['barcode'].values + for notEx in notExpected: + assert notEx not in keptBC['barcode'].values + +def test_farther_xyrange(): + zplane_cutoff = 1 + xy_cutoff = np.sqrt(20001) + zpositions = [0, 1.5, 3, 4.5, 6, 7.5, 9] + + bcSet = [b1, b2, b3] + bcDF = pd.DataFrame(bcSet) + expected = [x['barcode'] for x in [b1, b3]] + notExpected = [x['barcode'] for x in [b2]] + keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, + xy_cutoff, zpositions) + for ex in expected: + assert ex in keptBC['barcode'].values + for notEx in notExpected: + assert notEx not in keptBC['barcode'].values + From ba994381d81d2fbd601a280af590948726da8b4b Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 14:32:37 -0400 Subject: [PATCH 30/66] updating item method to address future warning --- merlin/data/dataorganization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/merlin/data/dataorganization.py b/merlin/data/dataorganization.py index 858841da..5cb02bf1 100755 --- a/merlin/data/dataorganization.py +++ b/merlin/data/dataorganization.py @@ -134,7 +134,7 @@ def get_data_channel_for_bit(self, bitName: str) -> int: Returns: The index of the associated data channel """ - return self.data[self.data['readoutName'] == bitName].index.item() + return self.data[self.data['readoutName'] == bitName].index.values.item() def get_data_channel_with_name(self, channelName: str) -> int: """Get the data channel associated with a gene name. 
@@ -144,7 +144,7 @@ def get_data_channel_with_name(self, channelName: str) -> int: Returns: The index of the associated data channel """ - return self.data[self.data['channelName'] == channelName].index.item() + return self.data[self.data['channelName'] == channelName].index.values.item() def get_fiducial_filename(self, dataChannel: int, fov: int) -> str: """Get the path for the image file that contains the fiducial From f1a5c8d667233a1ab6f1ea58cbbde8f2892d425f Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 14:34:24 -0400 Subject: [PATCH 31/66] updating test --- test/test_zplane_duplicate_removal.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py index 4845666b..868c180b 100644 --- a/test/test_zplane_duplicate_removal.py +++ b/test/test_zplane_duplicate_removal.py @@ -1,7 +1,4 @@ -import pytest import pandas as pd -import json -import networkx as nx import random import numpy as np from merlin.util import barcodefilters @@ -29,8 +26,8 @@ def generate_barcode(fov, barcode_id, x, y, z, mean_intensity): return bc b1 = generate_barcode(100, 5, 402.21, 787.11, 3, 14.23) -b2 = generate_barcode(100, 5, 502.21, 687.11, 4.5, 12.23) #same id, different place -b3 = generate_barcode(100, 17, 402.21, 787.11, 3, 10.23) #different id, same place +b2 = generate_barcode(100, 5, 502.21, 687.11, 4.5, 12.23) +b3 = generate_barcode(100, 17, 402.21, 787.11, 3, 10.23) b1_above_dimmer = generate_barcode(100, 5, 402.21, 787.11, 4.5, 11.23) b1_closeby_above_brighter = generate_barcode(100, 5, 403.21, 787.11, 4.5, 15.23) @@ -54,8 +51,8 @@ def test_multiple_comparisons_barcodes(): b1_closeby_below_brighter, b1_closeby_toofar_brighter]] - keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, - xy_cutoff, zpositions) + keptBC = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bcDF, zplane_cutoff, xy_cutoff, zpositions) for ex 
in expected: assert ex in keptBC['barcode'].values for notEx in notExpected: @@ -69,8 +66,8 @@ def test_all_compatible_barcodes(): bcSet = [b1, b2, b3, b1_closeby_toofar_brighter] bcDF = pd.DataFrame(bcSet) expected = [x['barcode'] for x in bcSet] - keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, - xy_cutoff, zpositions) + keptBC = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bcDF, zplane_cutoff, xy_cutoff, zpositions) for ex in expected: assert ex in keptBC['barcode'].values assert len(keptBC) == len(bcSet) @@ -84,8 +81,8 @@ def test_farther_zrange(): bcDF = pd.DataFrame(bcSet) expected = [x['barcode'] for x in [b2, b3, b1_closeby_toofar_brighter]] notExpected = [x['barcode'] for x in [b1]] - keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, - xy_cutoff, zpositions) + keptBC = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bcDF, zplane_cutoff, xy_cutoff, zpositions) for ex in expected: assert ex in keptBC['barcode'].values for notEx in notExpected: @@ -100,8 +97,8 @@ def test_farther_xyrange(): bcDF = pd.DataFrame(bcSet) expected = [x['barcode'] for x in [b1, b3]] notExpected = [x['barcode'] for x in [b2]] - keptBC = remove_zplane_duplicates_all_barcodeids(bcDF, zplane_cutoff, - xy_cutoff, zpositions) + keptBC = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bcDF, zplane_cutoff, xy_cutoff, zpositions) for ex in expected: assert ex in keptBC['barcode'].values for notEx in notExpected: From c505efc25fd265f5d81303686fdad0d375c8f762 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 14:51:00 -0400 Subject: [PATCH 32/66] fixing function call --- merlin/util/barcodefilters.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/merlin/util/barcodefilters.py b/merlin/util/barcodefilters.py index e55fc946..4dbe7781 100755 --- a/merlin/util/barcodefilters.py +++ b/merlin/util/barcodefilters.py @@ -35,10 +35,8 @@ def 
remove_zplane_duplicates_all_barcodeids(barcodes: pd.DataFrame, bcToKeep = [] for bcGroup, bcData in barcodeGroups: bcToKeep.append( - barcodefilters.remove_zplane_duplicates_single_barcodeid(bcData, - zPlanes, - maxDist, - allZPos)) + remove_zplane_duplicates_single_barcodeid(bcData, zPlanes, + maxDist, allZPos)) mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) return mergedBC From f94c64ebe9dbdd97ecfd731b9b7cc356a53920df Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 15:19:22 -0400 Subject: [PATCH 33/66] changing sequential high pass to false by default --- merlin/analysis/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/analysis/sequential.py b/merlin/analysis/sequential.py index df716d01..9a6c241c 100755 --- a/merlin/analysis/sequential.py +++ b/merlin/analysis/sequential.py @@ -21,7 +21,7 @@ def __init__(self, dataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) if 'apply_highpass' not in self.parameters: - self.parameters['apply_highpass'] = True + self.parameters['apply_highpass'] = False if 'highpass_sigma' not in self.parameters: self.parameters['highpass_sigma'] = 5 if 'z_index' not in self.parameters: From 5d7767fcbe5d11120ef62cf05bef9d2ba8a1bfdc Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 30 Mar 2020 16:59:40 -0400 Subject: [PATCH 34/66] updating docs --- docs/tasks.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/tasks.rst b/docs/tasks.rst index 6aa2e3bb..fcd0967f 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -68,10 +68,9 @@ Parameters: * area\_threshold -- Barcodes with areas below the specified area\_threshold are removed. * intensity\_threshold -- Barcodes with intensities below this threshold are removed. -* remove\_zplane\_duplicates -- Remove putative duplicate barcode counts from adjacent z planes. 
-* z\_planes\_above\_below -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. -* max\_centroid\_separation -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. - +* remove\_z\_duplicated\_barcodes -- Remove putative duplicate barcode counts from adjacent z planes. +* z\_duplicate\_zPlane\_threshold -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. +* z\_duplicate\_xy\_pixel\_threshold -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. filterbarcodes.GenerateAdaptiveThreshold ------------------------------------------- @@ -91,9 +90,9 @@ Description: Use an adaptive barcode to enrich the decode barcodes for the corre Parameters: * misidentification_rate -- The target misidentification rate, calculated as the number of blank barcodes per blank barcode divided by the number of coding barcodes per coding barcode. -* remove\_zplane\_duplicates -- Remove putative duplicate barcode counts from adjacent z planes. -* z\_planes\_above\_below -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. -* max\_centroid\_separation -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. +* remove\_z\_duplicated\_barcodes -- Remove putative duplicate barcode counts from adjacent z planes. +* z\_duplicate\_zPlane\_threshold -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. 
+* z\_duplicate\_xy\_pixel\_threshold -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. segment.SegmentCells ---------------------- From 3b3b7835a4a86c2b60c187270233610dbfffff31 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Tue, 31 Mar 2020 10:19:54 -0400 Subject: [PATCH 35/66] pep8 --- merlin/data/dataorganization.py | 6 ++++-- test/test_zplane_duplicate_removal.py | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/merlin/data/dataorganization.py b/merlin/data/dataorganization.py index 5cb02bf1..1fa584d3 100755 --- a/merlin/data/dataorganization.py +++ b/merlin/data/dataorganization.py @@ -134,7 +134,8 @@ def get_data_channel_for_bit(self, bitName: str) -> int: Returns: The index of the associated data channel """ - return self.data[self.data['readoutName'] == bitName].index.values.item() + return self.data[self.data['readoutName'] == + bitName].index.values.item() def get_data_channel_with_name(self, channelName: str) -> int: """Get the data channel associated with a gene name. 
@@ -144,7 +145,8 @@ def get_data_channel_with_name(self, channelName: str) -> int: Returns: The index of the associated data channel """ - return self.data[self.data['channelName'] == channelName].index.values.item() + return self.data[self.data['channelName'] == + channelName].index.values.item() def get_fiducial_filename(self, dataChannel: int, fov: int) -> str: """Get the path for the image file that contains the fiducial diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py index 868c180b..05e6a523 100644 --- a/test/test_zplane_duplicate_removal.py +++ b/test/test_zplane_duplicate_removal.py @@ -3,6 +3,7 @@ import numpy as np from merlin.util import barcodefilters + def generate_barcode(fov, barcode_id, x, y, z, mean_intensity): bc = {'barcode': random.getrandbits(32), 'barcode_id': barcode_id, @@ -25,6 +26,7 @@ def generate_barcode(fov, barcode_id, x, y, z, mean_intensity): return bc + b1 = generate_barcode(100, 5, 402.21, 787.11, 3, 14.23) b2 = generate_barcode(100, 5, 502.21, 687.11, 4.5, 12.23) b3 = generate_barcode(100, 17, 402.21, 787.11, 3, 10.23) @@ -58,6 +60,7 @@ def test_multiple_comparisons_barcodes(): for notEx in notExpected: assert notEx not in keptBC['barcode'].values + def test_all_compatible_barcodes(): zplane_cutoff = 1 xy_cutoff = np.sqrt(2) @@ -72,6 +75,7 @@ def test_all_compatible_barcodes(): assert ex in keptBC['barcode'].values assert len(keptBC) == len(bcSet) + def test_farther_zrange(): zplane_cutoff = 2 xy_cutoff = np.sqrt(2) @@ -88,6 +92,7 @@ def test_farther_zrange(): for notEx in notExpected: assert notEx not in keptBC['barcode'].values + def test_farther_xyrange(): zplane_cutoff = 1 xy_cutoff = np.sqrt(20001) @@ -103,4 +108,3 @@ def test_farther_xyrange(): assert ex in keptBC['barcode'].values for notEx in notExpected: assert notEx not in keptBC['barcode'].values - From 48183435cd343bad692675088784765b324dfa11 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Tue, 31 Mar 2020 16:53:27 -0400 
Subject: [PATCH 36/66] Increment version number. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fed80ba9..a45ed54b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="merlin", - version="0.1.5", + version="0.1.6", description="MERFISH decoding software", author="George Emanuel", author_email="emanuega0@gmail.com", From cde887be7bae0d9c5235f0ea0cab6132ac8f666e Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Wed, 8 Apr 2020 17:08:56 -0400 Subject: [PATCH 37/66] Add Lucy-Richardson deconvolution algorithm that uses projectors as described in 'Accelerating iterative deconvolution and multiview fusion by orders of magnitude', Guo et al, bioRxiv 2019. --- merlin/analysis/preprocess.py | 16 ++++++ merlin/util/deconvolve.py | 95 +++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index 90fba68b..6e3423e3 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -134,3 +134,19 @@ def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: filteredImage, deconFilterSize, self._deconSigma, self._deconIterations).astype(np.uint16) return deconvolvedImage + + +class DeconvolutionPreprocessGuo(Preprocess): + + def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: + highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) + deconFilterSize = self.parameters['decon_filter_size'] + + filteredImage = inputImage.astype(float) - cv2.GaussianBlur( + inputImage, (highPassFilterSize, highPassFilterSize), + self._highPassSigma, borderType=cv2.BORDER_REPLICATE) + filteredImage[filteredImage < 0] = 0 + deconvolvedImage = deconvolve.deconvolve_lucyrichardson_guo( + filteredImage, deconFilterSize, self._deconSigma, + self._deconIterations).astype(np.uint16) + return deconvolvedImage diff --git a/merlin/util/deconvolve.py b/merlin/util/deconvolve.py index 
787bf5bc..2b3188ed 100644 --- a/merlin/util/deconvolve.py +++ b/merlin/util/deconvolve.py @@ -9,7 +9,66 @@ images. """ +def calculate_projectors(windowSize: int, sigmaG: float) -> list: + """Calculate forward and backward projectors as described in: + 'Accelerating iterative deconvolution and multiview fusion by orders + of magnitude', Guo et al, bioRxiv 2019. + + Args: + windowSize: the size of the window over which to perform the gaussian. + This must be an odd number. + sigmaG: the standard deviation of the Gaussian point spread function + + Returns: + A list containing the forward and backward projectors to use for + Lucy-Richardson deconvolution. + """ + pf = matlab.matlab_gauss2D(shape = (windowSize, windowSize), + sigma = sigmaG) + pfFFT = np.fft.fft2(pf) + + # Wiener-Butterworth back projector. + # + # These values are from Guo et al. + alpha = 0.001 + beta = 0.001 + n = 8 + + # This is the cut-off frequency. + kc = 1.0/(0.5 * 2.355 * sigmaG) + + # FFT frequencies + kv = np.fft.fftfreq(pfFFT.shape[0]) + + kx = np.zeros((kv.size, kv.size)) + for i in range(kv.size): + kx[i,:] = np.copy(kv) + + ky = np.transpose(kx) + kk = np.sqrt(kx*kx + ky*ky) + + # Wiener filter + bWiener = pfFFT/(np.abs(pfFFT) * np.abs(pfFFT) + alpha) + + # Buttersworth filter + eps = np.sqrt(1.0/(beta*beta) - 1) + + kkSqr = kk*kk/(kc*kc) + bBWorth = 1.0/np.sqrt(1.0 + eps * eps * np.power(kkSqr, n)) + + # Weiner-Butterworth back projector + pbFFT = bWiener * bBWorth + + # back projector. + pb = np.real(np.fft.ifft2(pbFFT)) + + # normalize. 
+ pb = pb * np.sum(pf)/np.sum(pb) + + return [pf, pb] + + def deconvolve_lucyrichardson(image: np.ndarray, windowSize: int, sigmaG: float, iterationCount: int) -> np.ndarray: """Performs Lucy-Richardson deconvolution on the provided image using a @@ -74,3 +133,39 @@ def deconvolve_lucyrichardson(image: np.ndarray, windowSize: int, sigmaG: float, return J1 +def deconvolve_lucyrichardson_guo(image: np.ndarray, + windowSize: int, + sigmaG: float, + iterationCount: int) -> np.ndarray: + """Performs Lucy-Richardson deconvolution on the provided image using a + Gaussian point spread function. This version used the optimized + deconvolution approach described in: + + + 'Accelerating iterative deconvolution and multiview fusion by orders + of magnitude', Guo et al, bioRxiv 2019. + + Args: + image: the input image to be deconvolved + windowSize: the size of the window over which to perform the gaussian. + This must be an odd number. + sigmaG: the standard deviation of the Gaussian point spread function + iterationCount: the number of iterations to perform + + Returns: + the deconvolved image + """ + [pf, pb] = calculate_projectors(windowSize, sigmaG) + + eps = 1.0e-3 + + ek = np.copy(image) + np.clip(ek, eps, None, ek) + + for i in range(iterationCount): + ekf = cv2.filter2D(ek, -1, pf, borderType=cv2.BORDER_REPLICATE) + np.clip(ekf, eps, None, ekf) + ek = ek*cv2.filter2D(image/ekf, -1, pb, borderType=cv2.BORDER_REPLICATE) + np.clip(ek, eps, None, ek) + + return ek From 9dd91d7eb419fb37ad293d0d3f836251ee820dc1 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Wed, 8 Apr 2020 20:03:03 -0400 Subject: [PATCH 38/66] Fix super-class. Add FIXME about setting 'decon_iterations'. 
--- merlin/analysis/preprocess.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index 6e3423e3..82f67bb2 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -136,8 +136,21 @@ def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: return deconvolvedImage -class DeconvolutionPreprocessGuo(Preprocess): - +class DeconvolutionPreprocessGuo(DeconvolutionPreprocess): + + def __init__(self, dataSet, parameters=None, analysisName=None): + super().__init__(dataSet, parameters, analysisName) + + # FIXME: This doesn't work as hoped because 'decon_iterations' + # is added in the super-class. So we end up with 20 + # iterations instead of the 2 that we want to be the + # default. + # + if 'decon_iterations' not in self.parameters: + self.parameters['decon_iterations'] = 2 + + self._deconIterations = self.parameters['decon_iterations'] + def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) deconFilterSize = self.parameters['decon_filter_size'] From fa5376ffb520caa1df0cd37747ea6e70a9b6d74b Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Thu, 9 Apr 2020 08:49:57 -0400 Subject: [PATCH 39/66] Update FIXME. --- merlin/analysis/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index 82f67bb2..d4d6c1f2 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -144,7 +144,8 @@ def __init__(self, dataSet, parameters=None, analysisName=None): # FIXME: This doesn't work as hoped because 'decon_iterations' # is added in the super-class. So we end up with 20 # iterations instead of the 2 that we want to be the - # default. + # default. And we can't do this before calling the + # super() because then self.parameters won't exist. 
# if 'decon_iterations' not in self.parameters: self.parameters['decon_iterations'] = 2 From 0cfd0f7f149a19df875d5174ab2c3ce3e28be6a7 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Thu, 9 Apr 2020 11:09:30 -0400 Subject: [PATCH 40/66] Fix default value for 'decon_iterations' as suggested by George E. Improve PEP8 compliance. --- merlin/analysis/preprocess.py | 22 +++++++++----------- merlin/util/deconvolve.py | 38 ++++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 28 deletions(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index d4d6c1f2..bcc1c8fb 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -11,19 +11,19 @@ class Preprocess(analysistask.ParallelAnalysisTask): """ - An abstract class for preparing data for barcode calling. + An abstract class for preparing data for barcode calling. """ def _image_name(self, fov): destPath = self.dataSet.get_analysis_subdirectory( self.analysisName, subdirectory='preprocessed_images') return os.sep.join([destPath, 'fov_' + str(fov) + '.tif']) - + def get_pixel_histogram(self, fov=None): if fov is not None: return self.dataSet.load_numpy_analysis_result( 'pixel_histogram', self.analysisName, fov, 'histograms') - + pixelHistogram = np.zeros(self.get_pixel_histogram( self.dataSet.get_fovs()[0]).shape) for f in self.dataSet.get_fovs(): @@ -62,7 +62,7 @@ def __init__(self, dataSet, parameters=None, analysisName=None): def fragment_count(self): return len(self.dataSet.get_fovs()) - + def get_estimated_memory(self): return 2048 @@ -141,17 +141,15 @@ class DeconvolutionPreprocessGuo(DeconvolutionPreprocess): def __init__(self, dataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) - # FIXME: This doesn't work as hoped because 'decon_iterations' - # is added in the super-class. So we end up with 20 - # iterations instead of the 2 that we want to be the - # default. 
And we can't do this before calling the - # super() because then self.parameters won't exist. - # - if 'decon_iterations' not in self.parameters: + # Check for 'decon_iterations' in parameters instead of + # self.parameters as 'decon_iterations' is added to + # self.parameters by the super-class with a default value + # of 20, but we want the default value to be 2. + if 'decon_iterations' not in parameters: self.parameters['decon_iterations'] = 2 self._deconIterations = self.parameters['decon_iterations'] - + def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) deconFilterSize = self.parameters['decon_filter_size'] diff --git a/merlin/util/deconvolve.py b/merlin/util/deconvolve.py index 2b3188ed..e1840ce0 100644 --- a/merlin/util/deconvolve.py +++ b/merlin/util/deconvolve.py @@ -9,11 +9,12 @@ images. """ + def calculate_projectors(windowSize: int, sigmaG: float) -> list: """Calculate forward and backward projectors as described in: - 'Accelerating iterative deconvolution and multiview fusion by orders - of magnitude', Guo et al, bioRxiv 2019. + 'Accelerating iterative deconvolution and multiview fusion by orders + of magnitude', Guo et al, bioRxiv 2019. Args: windowSize: the size of the window over which to perform the gaussian. @@ -24,10 +25,10 @@ def calculate_projectors(windowSize: int, sigmaG: float) -> list: A list containing the forward and backward projectors to use for Lucy-Richardson deconvolution. """ - pf = matlab.matlab_gauss2D(shape = (windowSize, windowSize), - sigma = sigmaG) + pf = matlab.matlab_gauss2D(shape=(windowSize, windowSize), + sigma=sigmaG) pfFFT = np.fft.fft2(pf) - + # Wiener-Butterworth back projector. # # These values are from Guo et al. 
@@ -43,7 +44,7 @@ def calculate_projectors(windowSize: int, sigmaG: float) -> list: kx = np.zeros((kv.size, kv.size)) for i in range(kv.size): - kx[i,:] = np.copy(kv) + kx[i, :] = np.copy(kv) ky = np.transpose(kx) kk = np.sqrt(kx*kx + ky*ky) @@ -59,17 +60,19 @@ def calculate_projectors(windowSize: int, sigmaG: float) -> list: # Weiner-Butterworth back projector pbFFT = bWiener * bBWorth - + # back projector. pb = np.real(np.fft.ifft2(pbFFT)) - + # normalize. pb = pb * np.sum(pf)/np.sum(pb) - + return [pf, pb] - -def deconvolve_lucyrichardson(image: np.ndarray, windowSize: int, sigmaG: float, + +def deconvolve_lucyrichardson(image: np.ndarray, + windowSize: int, + sigmaG: float, iterationCount: int) -> np.ndarray: """Performs Lucy-Richardson deconvolution on the provided image using a Gaussian point spread function. @@ -142,7 +145,7 @@ def deconvolve_lucyrichardson_guo(image: np.ndarray, deconvolution approach described in: - 'Accelerating iterative deconvolution and multiview fusion by orders + 'Accelerating iterative deconvolution and multiview fusion by orders of magnitude', Guo et al, bioRxiv 2019. Args: @@ -158,14 +161,17 @@ def deconvolve_lucyrichardson_guo(image: np.ndarray, [pf, pb] = calculate_projectors(windowSize, sigmaG) eps = 1.0e-3 - + ek = np.copy(image) np.clip(ek, eps, None, ek) for i in range(iterationCount): - ekf = cv2.filter2D(ek, -1, pf, borderType=cv2.BORDER_REPLICATE) + ekf = cv2.filter2D(ek, -1, pf, + borderType=cv2.BORDER_REPLICATE) np.clip(ekf, eps, None, ekf) - ek = ek*cv2.filter2D(image/ekf, -1, pb, borderType=cv2.BORDER_REPLICATE) + + ek = ek*cv2.filter2D(image/ekf, -1, pb, + borderType=cv2.BORDER_REPLICATE) np.clip(ek, eps, None, ek) - + return ek From 13603382ae031fecdee670c424c4ee70bec3b44e Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Thu, 9 Apr 2020 17:42:12 -0400 Subject: [PATCH 41/66] Add DeconvolutionPreprocessGuo to the documentation. Update CHANGELOG. 
--- CHANGELOG.md | 6 +++++- docs/tasks.rst | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61272305..0e0b5c70 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,4 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Updated the filemap to only store the file name so that it can easily be pointed to new data home directories. This change maintains backward compatibility. - Improved decoding speed ### Added -- Parameters to filter tasks that enable removing barcodes that were putatively duplicated across adjacent z planes. +- Parameters to filter tasks that enable removing barcodes that were putatively duplicated across adjacent z planes. + +## [0.1.6] - +### Added +- An alternative Lucy-Richardson deconvolution approach that requires ~10x fewer iterations. diff --git a/docs/tasks.rst b/docs/tasks.rst index fcd0967f..66a246b0 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -35,6 +35,19 @@ Parameters: * decon\_iterations -- The number of Lucy-Richardson deconvolution iterations to perform on each image. * decon\_filter\_size -- The size of the gaussian filter to use for the deconvolution. It is not recommended to set this parameter manually. +preprocess.DeconvolutionPreprocessGuo +-------------------------------------- + +Description: High-pass filters and deconvolves the image data in preparation for bit-calling. This version uses the Lucy-Richardson deconvolution approach described in this reference - `Guo et al. `. + +Parameters: + +* warp\_task -- The name of the warp task that provides the aligned image stacks. +* highpass\_sigma -- The standard deviation to use for the high pass filter. +* decon\_sigma -- The standard deviation to use for the Lucy-Richardson deconvolution. +* decon\_iterations -- The number of Lucy-Richardson deconvolution iterations to perform on each image. The default value is 2. 
+* decon\_filter\_size -- The size of the gaussian filter to use for the deconvolution. It is not recommended to set this parameter manually. + optimize.Optimize ------------------ From 6217acaf333bf2d6595305396bac5f4f04024e88 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Fri, 10 Apr 2020 10:49:57 -0400 Subject: [PATCH 42/66] addressing edge case of no barcodes and correcting z position indexing error --- merlin/analysis/filterbarcodes.py | 12 ++++--- merlin/util/barcodefilters.py | 54 ++++++++++++++++++------------- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index 744f52f7..ca0c315d 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -77,8 +77,8 @@ def _run_analysis(self, fragmentIndex): .get_filtered_barcodes(areaThreshold, intensityThreshold, distanceThreshold=distanceThreshold, fov=fragmentIndex) - currentBC = self.remove_z_duplicate_barcodes(currentBC) - barcodeDB.write_barcodes(currentBC, fov=fragmentIndex) + barcodeDB.write_barcodes( + self.remove_z_duplicate_barcodes(currentBC), fov=fragmentIndex) class GenerateAdaptiveThreshold(analysistask.AnalysisTask): @@ -395,7 +395,9 @@ def _run_analysis(self, fragmentIndex): bcDatabase = self.get_barcode_database() currentBarcodes = decodeTask.get_barcode_database()\ .get_barcodes(fragmentIndex) - currentBarcodes = self.remove_z_duplicate_barcodes(currentBarcodes) + + currentBarcodes = adaptiveTask.extract_barcodes_with_threshold( + threshold, currentBarcodes) bcDatabase.write_barcodes( - adaptiveTask.extract_barcodes_with_threshold( - threshold, currentBarcodes), fov=fragmentIndex) + self.remove_z_duplicate_barcodes(currentBarcodes), + fov=fragmentIndex) diff --git a/merlin/util/barcodefilters.py b/merlin/util/barcodefilters.py index 4dbe7781..36431e85 100755 --- a/merlin/util/barcodefilters.py +++ b/merlin/util/barcodefilters.py @@ -31,15 +31,18 @@ def 
remove_zplane_duplicates_all_barcodeids(barcodes: pd.DataFrame, fall within parameters of z plane duplicates have been removed. """ - barcodeGroups = barcodes.groupby('barcode_id') - bcToKeep = [] - for bcGroup, bcData in barcodeGroups: - bcToKeep.append( - remove_zplane_duplicates_single_barcodeid(bcData, zPlanes, - maxDist, allZPos)) - mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) - mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) - return mergedBC + if len(barcodes) == 0: + return barcodes + else: + barcodeGroups = barcodes.groupby('barcode_id') + bcToKeep = [] + for bcGroup, bcData in barcodeGroups: + bcToKeep.append( + remove_zplane_duplicates_single_barcodeid(bcData, zPlanes, + maxDist, allZPos)) + mergedBC = pd.concat(bcToKeep, 0).reset_index(drop=True) + mergedBC = mergedBC.sort_values(by=['barcode_id', 'z']) + return mergedBC def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, @@ -72,21 +75,26 @@ def remove_zplane_duplicates_single_barcodeid(barcodes: pd.DataFrame, graph = nx.Graph() zPos = sorted(allZPos) graph.add_nodes_from(barcodes.index.values.tolist()) - for i in range(0, len(zPos)): - z = zPos[i] - zToCompare = [otherZ for pos, otherZ in enumerate(zPos) if - (pos >= i - zPlanes) & (pos <= i + zPlanes) & ~(pos == i)] + for z in range(0, len(zPos)): + zToCompare = [pos for pos, otherZ in enumerate(zPos) if + (pos >= z - zPlanes) & (pos <= z + zPlanes) & ~(pos == z)] treeBC = barcodes[barcodes['z'] == z] - tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) - for compZ in zToCompare: - queryBC = barcodes[barcodes['z'] == compZ] - dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, k=1, - distance_upper_bound=maxDist) - currentHits = treeBC.index.values[idx[np.isfinite(dist)]] - comparisonHits = queryBC.index.values[np.isfinite(dist)] - graph.add_edges_from(list(zip(currentHits, comparisonHits))) - connectedComponents = [list(x) for x in - list(nx.connected_components(graph))] + if len(treeBC) == 0: + pass + else: 
+ tree = cKDTree(treeBC.loc[:, ['x', 'y']].values) + for compZ in zToCompare: + queryBC = barcodes[barcodes['z'] == compZ] + if len(queryBC) == 0: + pass + else: + dist, idx = tree.query(queryBC.loc[:, ['x', 'y']].values, + k=1, distance_upper_bound=maxDist) + currentHits = treeBC.index.values[idx[np.isfinite(dist)]] + comparisonHits = queryBC.index.values[np.isfinite(dist)] + graph.add_edges_from(list(zip(currentHits, comparisonHits))) + connectedComponents = [list(x) for x in + list(nx.connected_components(graph))] def choose_brighter_barcode(barcodes, indexes): sortedBC = barcodes.loc[indexes, :].sort_values(by='mean_intensity', From 1855b123bb6e19935fb1f12a0f159cf4717c96c0 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Fri, 10 Apr 2020 11:02:01 -0400 Subject: [PATCH 43/66] updating test z barcode z indexes to reflect typical barcode z indexing --- test/test_zplane_duplicate_removal.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py index 05e6a523..5bfcd8e7 100644 --- a/test/test_zplane_duplicate_removal.py +++ b/test/test_zplane_duplicate_removal.py @@ -27,14 +27,14 @@ def generate_barcode(fov, barcode_id, x, y, z, mean_intensity): return bc -b1 = generate_barcode(100, 5, 402.21, 787.11, 3, 14.23) -b2 = generate_barcode(100, 5, 502.21, 687.11, 4.5, 12.23) -b3 = generate_barcode(100, 17, 402.21, 787.11, 3, 10.23) - -b1_above_dimmer = generate_barcode(100, 5, 402.21, 787.11, 4.5, 11.23) -b1_closeby_above_brighter = generate_barcode(100, 5, 403.21, 787.11, 4.5, 15.23) -b2_above_brighter = generate_barcode(100, 5, 502.31, 687.11, 6, 14.23) -b1_closeby_below_brighter = generate_barcode(100, 5, 403.21, 787.11, 1.5, 15.0) +b1 = generate_barcode(100, 5, 402.21, 787.11, 2, 14.23) +b2 = generate_barcode(100, 5, 502.21, 687.11, 3, 12.23) +b3 = generate_barcode(100, 17, 402.21, 787.11, 2, 10.23) + +b1_above_dimmer = generate_barcode(100, 5, 402.21, 
787.11, 3, 11.23) +b1_closeby_above_brighter = generate_barcode(100, 5, 403.21, 787.11, 3, 15.23) +b2_above_brighter = generate_barcode(100, 5, 502.31, 687.11, 4, 14.23) +b1_closeby_below_brighter = generate_barcode(100, 5, 403.21, 787.11, 1, 15.0) b1_closeby_toofar_brighter = generate_barcode(100, 5, 403.21, 787.11, 0, 15.0) From e38e9b25659769d64f61253124cdbf65c561a6ba Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Fri, 10 Apr 2020 11:02:19 -0400 Subject: [PATCH 44/66] updating changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61272305..38171e23 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,3 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Improved decoding speed ### Added - Parameters to filter tasks that enable removing barcodes that were putatively duplicated across adjacent z planes. + +## [0.1.6] - +### Fixed +- Fixed bug and edge cases in removal of barcodes duplicated across z planes From d09954d36632217d0ed539b0bfbf7e4e7b67e5ee Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Fri, 10 Apr 2020 17:54:38 -0400 Subject: [PATCH 45/66] Rename filter module to imagefilters. Change high_pass_filter function to use cv2.GaussianBlur(). Update sequential for changes to high_pass_filter arguments. 
--- merlin/analysis/sequential.py | 9 ++++++--- merlin/util/filter.py | 22 ---------------------- merlin/util/imagefilters.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 25 deletions(-) delete mode 100755 merlin/util/filter.py create mode 100755 merlin/util/imagefilters.py diff --git a/merlin/analysis/sequential.py b/merlin/analysis/sequential.py index 9a6c241c..f2995fcd 100755 --- a/merlin/analysis/sequential.py +++ b/merlin/analysis/sequential.py @@ -6,7 +6,7 @@ from skimage.measure import regionprops from merlin.core import analysistask -from merlin.util import filter +from merlin.util import imagefilters class SumSignal(analysistask.ParallelAnalysisTask): @@ -92,8 +92,11 @@ def _get_sum_signal(self, fov, channels, zIndex): for ch in channels: img = fTask.get_aligned_image(fov, ch, zIndex) if self.highpass: - img = filter.high_pass_filter( - img, self.parameters['highpass_sigma']) + highPassSigma = self.parameters['highpass_sigma'] + highPassFilterSize = int(2 * np.ceil(3 * highPassSigma) + 1) + img = imagefilters.high_pass_filter(img, + highPassFilterSize, + highPassSigma) signals.append(self._extract_signal(cells, img, zIndex).iloc[:, [0]]) diff --git a/merlin/util/filter.py b/merlin/util/filter.py deleted file mode 100755 index 6ddded4f..00000000 --- a/merlin/util/filter.py +++ /dev/null @@ -1,22 +0,0 @@ -from scipy import ndimage -import numpy as np - -""" -This module contains code for performing filtering operations on images -""" - - -def high_pass_filter(image: np.ndarray, lowPassKernel: int) -> np.ndarray: - """ - Args: - image: the input image to be filtered - lowPassKernel: the size of the gaussian kernel to use for low pass. 
- - Returns: - the high pass filtered image image - """ - img = image.astype(np.int16) - lowpass = ndimage.gaussian_filter(img, lowPassKernel) - gauss_highpass = img - lowpass - gauss_highpass[gauss_highpass < 0] = 0 - return gauss_highpass diff --git a/merlin/util/imagefilters.py b/merlin/util/imagefilters.py new file mode 100755 index 00000000..61104fd7 --- /dev/null +++ b/merlin/util/imagefilters.py @@ -0,0 +1,28 @@ +import cv2 +import numpy as np + +""" +This module contains code for performing filtering operations on images +""" + + +def high_pass_filter(image: np.ndarray, + windowSize: int, + sigma: float) -> np.ndarray: + """ + Args: + image: the input image to be filtered + windowSize: the size of the Gaussian kernel to use. + sigma: the sigma of the Gaussian. + + Returns: + the high pass filtered image image + """ + img = image.astype(np.int16) + lowpass = cv2.GaussianBlur(img, + (windowSize, windowSize), + sigma, + borderType=cv2.BORDER_REPLICATE) + gauss_highpass = img - lowpass + gauss_highpass[gauss_highpass < 0] = 0 + return gauss_highpass From 7beed889f72ae152c21e13b8ef1d2751a3a5e44d Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Fri, 10 Apr 2020 17:56:07 -0400 Subject: [PATCH 46/66] Move image high pass filtering into it's own method. Use imagefilters.high_pass_filter() function. 
--- merlin/analysis/preprocess.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index bcc1c8fb..e163ac6f 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -5,6 +5,7 @@ from merlin.core import analysistask from merlin.util import deconvolve from merlin.util import aberration +from merlin.util import imagefilters from merlin.data import codebook @@ -99,6 +100,12 @@ def get_processed_image( chromaticCorrector) return self._preprocess_image(inputImage) + def _high_pass_filter(self, inputImage: np.ndarray) -> np.ndarray: + highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) + return imagefilters.high_pass_filter(inputImage, + highPassFilterSize, + self._highPassSigma) + def _run_analysis(self, fragmentIndex): warpTask = self.dataSet.load_analysis_task( self.parameters['warp_task']) @@ -123,13 +130,9 @@ def _run_analysis(self, fragmentIndex): self._save_pixel_histogram(pixelHistogram, fragmentIndex) def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: - highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) deconFilterSize = self.parameters['decon_filter_size'] - - filteredImage = inputImage.astype(float) - cv2.GaussianBlur( - inputImage, (highPassFilterSize, highPassFilterSize), - self._highPassSigma, borderType=cv2.BORDER_REPLICATE) - filteredImage[filteredImage < 0] = 0 + + filteredImage = self._high_pass_filter(inputImage) deconvolvedImage = deconvolve.deconvolve_lucyrichardson( filteredImage, deconFilterSize, self._deconSigma, self._deconIterations).astype(np.uint16) @@ -151,13 +154,9 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self._deconIterations = self.parameters['decon_iterations'] def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: - highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) deconFilterSize = 
self.parameters['decon_filter_size'] - filteredImage = inputImage.astype(float) - cv2.GaussianBlur( - inputImage, (highPassFilterSize, highPassFilterSize), - self._highPassSigma, borderType=cv2.BORDER_REPLICATE) - filteredImage[filteredImage < 0] = 0 + filteredImage = self._high_pass_filter(inputImage) deconvolvedImage = deconvolve.deconvolve_lucyrichardson_guo( filteredImage, deconFilterSize, self._deconSigma, self._deconIterations).astype(np.uint16) From e1f7b1c19d12ecc08816bf577d1b407269ff160b Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Fri, 10 Apr 2020 17:57:29 -0400 Subject: [PATCH 47/66] Remove normalization as the back projector is already normalized. Reduce eps and add a maximum to numpy.clip(). --- merlin/util/deconvolve.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/merlin/util/deconvolve.py b/merlin/util/deconvolve.py index e1840ce0..a320a0c7 100644 --- a/merlin/util/deconvolve.py +++ b/merlin/util/deconvolve.py @@ -64,9 +64,6 @@ def calculate_projectors(windowSize: int, sigmaG: float) -> list: # back projector. pb = np.real(np.fft.ifft2(pbFFT)) - # normalize. - pb = pb * np.sum(pf)/np.sum(pb) - return [pf, pb] @@ -144,7 +141,6 @@ def deconvolve_lucyrichardson_guo(image: np.ndarray, Gaussian point spread function. This version used the optimized deconvolution approach described in: - 'Accelerating iterative deconvolution and multiview fusion by orders of magnitude', Guo et al, bioRxiv 2019. 
@@ -160,7 +156,8 @@ def deconvolve_lucyrichardson_guo(image: np.ndarray, """ [pf, pb] = calculate_projectors(windowSize, sigmaG) - eps = 1.0e-3 + eps = 1.0e-6 + i_max = 2**16-1 ek = np.copy(image) np.clip(ek, eps, None, ek) @@ -168,10 +165,10 @@ def deconvolve_lucyrichardson_guo(image: np.ndarray, for i in range(iterationCount): ekf = cv2.filter2D(ek, -1, pf, borderType=cv2.BORDER_REPLICATE) - np.clip(ekf, eps, None, ekf) + np.clip(ekf, eps, i_max, ekf) ek = ek*cv2.filter2D(image/ekf, -1, pb, borderType=cv2.BORDER_REPLICATE) - np.clip(ek, eps, None, ek) + np.clip(ek, eps, i_max, ek) return ek From 61212f7b88ad6f400f3bf90ad0784ce97b443b86 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Fri, 10 Apr 2020 18:12:18 -0400 Subject: [PATCH 48/66] Cast high pass image to float for deconvolve_lucyrichardson() function. --- merlin/analysis/preprocess.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index e163ac6f..acfb9660 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -102,9 +102,10 @@ def get_processed_image( def _high_pass_filter(self, inputImage: np.ndarray) -> np.ndarray: highPassFilterSize = int(2 * np.ceil(2 * self._highPassSigma) + 1) - return imagefilters.high_pass_filter(inputImage, - highPassFilterSize, - self._highPassSigma) + hpImage = imagefilters.high_pass_filter(inputImage, + highPassFilterSize, + self._highPassSigma) + return hpImage.astype(np.float) def _run_analysis(self, fragmentIndex): warpTask = self.dataSet.load_analysis_task( From 4f3f481135a177e65fd46b6c4d47cff07d115024 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Fri, 10 Apr 2020 18:14:57 -0400 Subject: [PATCH 49/66] Fix whitespace. 
--- merlin/analysis/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/analysis/preprocess.py b/merlin/analysis/preprocess.py index acfb9660..32a904f9 100755 --- a/merlin/analysis/preprocess.py +++ b/merlin/analysis/preprocess.py @@ -132,7 +132,7 @@ def _run_analysis(self, fragmentIndex): def _preprocess_image(self, inputImage: np.ndarray) -> np.ndarray: deconFilterSize = self.parameters['decon_filter_size'] - + filteredImage = self._high_pass_filter(inputImage) deconvolvedImage = deconvolve.deconvolve_lucyrichardson( filteredImage, deconFilterSize, self._deconSigma, From 1b59e3dc4775fc0223776f1860b7fd40d88bf640 Mon Sep 17 00:00:00 2001 From: George Emanuel Date: Fri, 10 Apr 2020 20:08:55 -0400 Subject: [PATCH 50/66] Changed high_pass_filter to maintain the data type of the input image. --- merlin/util/imagefilters.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/merlin/util/imagefilters.py b/merlin/util/imagefilters.py index 61104fd7..15e62185 100755 --- a/merlin/util/imagefilters.py +++ b/merlin/util/imagefilters.py @@ -16,13 +16,13 @@ def high_pass_filter(image: np.ndarray, sigma: the sigma of the Gaussian. Returns: - the high pass filtered image image + the high pass filtered image. The returned image is the same type + as the input image. """ - img = image.astype(np.int16) - lowpass = cv2.GaussianBlur(img, + lowpass = cv2.GaussianBlur(image, (windowSize, windowSize), sigma, borderType=cv2.BORDER_REPLICATE) - gauss_highpass = img - lowpass - gauss_highpass[gauss_highpass < 0] = 0 + gauss_highpass = image - lowpass + gauss_highpass[lowpass > image] = 0 return gauss_highpass From 2fa7927ddb1b52f64c73a5e6f7fd8d64ec8098a4 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Sat, 11 Apr 2020 11:37:44 -0400 Subject: [PATCH 51/66] Add deconvolution tests. 
--- test/test_decon.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 test/test_decon.py diff --git a/test/test_decon.py b/test/test_decon.py new file mode 100644 index 00000000..084f8322 --- /dev/null +++ b/test/test_decon.py @@ -0,0 +1,62 @@ +import cv2 +import numpy as np +import random + +import merlin.util.deconvolve as deconvolve +import merlin.util.matlab as matlab + + +decon_sigma = 2 +decon_filter_size = 9 + + +def decon_diff(image, gt_image): + on_gt = np.sum(image[(gt_image > 0)]) + off_gt = np.sum(image[gt_image == 0]) + + return (on_gt/(on_gt + off_gt)) + + +def make_image(): + # Always make the same image. + random.seed(42) + + # Ground truth. + gt_image = np.zeros((100, 150)) + for i in range(40): + x = random.randint(5, 95) + y = random.randint(5, 145) + gt_image[x, y] = random.randint(10, 50) + + [pf, pb] = deconvolve.calculate_projectors(64, decon_sigma) + image = cv2.filter2D(gt_image, -1, pf, borderType=cv2.BORDER_REPLICATE) + + return [image, gt_image] + + +def test_deconvolve_lucyrichardson(): + [image, gt_image] = make_image() + + d1 = decon_diff(image, gt_image) + d_image = deconvolve.deconvolve_lucyrichardson(image, + decon_filter_size, + decon_sigma, + 20) + d2 = decon_diff(d_image, gt_image) + print(d1, d2) + + assert (d2 > d1) + + +def test_deconvolve_lucyrichardson_guo(): + [image, gt_image] = make_image() + + d1 = decon_diff(image, gt_image) + d_image = deconvolve.deconvolve_lucyrichardson_guo(image, + decon_filter_size, + decon_sigma, + 2) + d2 = decon_diff(d_image, gt_image) + print(d1, d2) + + assert (d2 > d1) From 0af265e395750c9c69ee848d0024639e6a25ca51 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Mon, 13 Apr 2020 15:15:14 -0400 Subject: [PATCH 52/66] Add optional 'fov_index' parameter that specifies which fov and z sections to optimize on if deterministic results are needed. 
--- merlin/analysis/optimize.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/merlin/analysis/optimize.py b/merlin/analysis/optimize.py index 81787fa7..00f73084 100755 --- a/merlin/analysis/optimize.py +++ b/merlin/analysis/optimize.py @@ -32,6 +32,12 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['optimize_chromatic_correction'] = False if 'crop_width' not in self.parameters: self.parameters['crop_width'] = 0 + if 'fov_index' in self.parameters: + # Set iterations number to length of fov_index if not + # None. Should we warn the user? + # + self.parameters['fov_per_iteration'] = \ + len(self.parameters['fov_index']) def get_estimated_memory(self): return 4000 @@ -59,9 +65,25 @@ def _run_analysis(self, fragmentIndex): self.parameters['preprocess_task']) codebook = self.get_codebook() - fovIndex = np.random.choice(list(self.dataSet.get_fovs())) - zIndex = np.random.choice( - list(range(len(self.dataSet.get_z_positions())))) + # Use random FOV and Z index if the user did not specify. + if 'fov_index' not in self.parameters: + fovIndex = np.random.choice(list(self.dataSet.get_fovs())) + zIndex = np.random.choice( + list(range(len(self.dataSet.get_z_positions())))) + + # Otherwise use FOV and Z indices specified by the user. + else: + tmp = self.parameters['fov_index'][fragmentIndex] + + # Two element list is FOV, z index. + if isinstance(tmp, list): + fovIndex = tmp[0] + zIndex = tmp[1] + + # Otherwise this is the FOV index. + else: + fovIndex = tmp + zIndex = 0 scaleFactors = self._get_previous_scale_factors() backgrounds = self._get_previous_backgrounds() From d8a60fca304266bf34d29f844d37390e71c95f4f Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Wed, 15 Apr 2020 12:21:59 -0400 Subject: [PATCH 53/66] Add documentation for the fov_index parameter. 
Move random choice of fov and Z value to __init__ and store in 'fov_index' parameter so we have a record of what fov and Z values were used for optimization if 'fov_index' was not specified. Change 'fov_index' to require a both fov and z value for every element. --- docs/tasks.rst | 3 ++- merlin/analysis/optimize.py | 36 ++++++++++++++---------------------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/docs/tasks.rst b/docs/tasks.rst index 66a246b0..918ebd62 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -56,7 +56,8 @@ Description: Determines the optimal per-bit scale factors for barcode decoding. Parameters: * iteration\_count -- The number of iterations to perform for the optimization. -* fov\_per\_iteration -- The number of fields of view to decode in each round of optimization. +* fov\_index -- (Optional) A list of [[fov_1, z_value_1], [fov_2, z_value_2], ..] specifying which fields of view and what z values should be used for optimization. +* fov\_per\_iteration -- The number of fields of view to decode in each round of optimization. This will be set to the length of ``fov_index`` if the ``fov_index`` is specified. * estimate\_initial\_scale\_factors\_from\_cdf -- Flag indicating if the initial scale factors should be estimated from the pixel intensity cdf. If false, the initial scale factors are all set to 1. If true, the initial scale factors are based on the 90th percentile of the pixe intensity cdf. * area\_threshold -- The minimum barcode area for barcodes to be used in the calculation of the scale factors. 
diff --git a/merlin/analysis/optimize.py b/merlin/analysis/optimize.py index 00f73084..32202025 100755 --- a/merlin/analysis/optimize.py +++ b/merlin/analysis/optimize.py @@ -32,12 +32,22 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['optimize_chromatic_correction'] = False if 'crop_width' not in self.parameters: self.parameters['crop_width'] = 0 + if 'fov_index' in self.parameters: - # Set iterations number to length of fov_index if not - # None. Should we warn the user? - # + logger = self.dataSet.get_logger(self) + logger.info('Setting fov_per_iteration to length of fov_index') + self.parameters['fov_per_iteration'] = \ len(self.parameters['fov_index']) + + else: + self.parameters['fov_index'] = [] + for i in range(self.parameters['fov_per_iteration']): + fovIndex = int(np.random.choice( + list(self.dataSet.get_fovs()))) + zIndex = int(np.random.choice( + list(range(len(self.dataSet.get_z_positions()))))) + self.parameters['fov_index'].append([fovIndex, zIndex]) def get_estimated_memory(self): return 4000 @@ -65,25 +75,7 @@ def _run_analysis(self, fragmentIndex): self.parameters['preprocess_task']) codebook = self.get_codebook() - # Use random FOV and Z index if the user did not specify. - if 'fov_index' not in self.parameters: - fovIndex = np.random.choice(list(self.dataSet.get_fovs())) - zIndex = np.random.choice( - list(range(len(self.dataSet.get_z_positions())))) - - # Otherwise use FOV and Z indices specified by the user. - else: - tmp = self.parameters['fov_index'][fragmentIndex] - - # Two element list is FOV, z index. - if isinstance(tmp, list): - fovIndex = tmp[0] - zIndex = tmp[1] - - # Otherwise this is the FOV index. 
- else: - fovIndex = tmp - zIndex = 0 + fovIndex, zIndex = self.parameters['fov_index'][fragmentIndex] scaleFactors = self._get_previous_scale_factors() backgrounds = self._get_previous_backgrounds() From 64532d257633543848216c5116b6601e1d915b70 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Wed, 15 Apr 2020 12:26:17 -0400 Subject: [PATCH 54/66] Remove whitespace. --- merlin/analysis/optimize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/merlin/analysis/optimize.py b/merlin/analysis/optimize.py index 32202025..f182ab2b 100755 --- a/merlin/analysis/optimize.py +++ b/merlin/analysis/optimize.py @@ -32,14 +32,14 @@ def __init__(self, dataSet, parameters=None, analysisName=None): self.parameters['optimize_chromatic_correction'] = False if 'crop_width' not in self.parameters: self.parameters['crop_width'] = 0 - + if 'fov_index' in self.parameters: logger = self.dataSet.get_logger(self) logger.info('Setting fov_per_iteration to length of fov_index') self.parameters['fov_per_iteration'] = \ len(self.parameters['fov_index']) - + else: self.parameters['fov_index'] = [] for i in range(self.parameters['fov_per_iteration']): From cce491a2d89be79bf8094c54c4e7ddf5fe379182 Mon Sep 17 00:00:00 2001 From: Hazen Babcock Date: Wed, 15 Apr 2020 12:27:15 -0400 Subject: [PATCH 55/66] Text tweak. --- docs/tasks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.rst b/docs/tasks.rst index 918ebd62..96dae679 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -57,7 +57,7 @@ Parameters: * iteration\_count -- The number of iterations to perform for the optimization. * fov\_index -- (Optional) A list of [[fov_1, z_value_1], [fov_2, z_value_2], ..] specifying which fields of view and what z values should be used for optimization. -* fov\_per\_iteration -- The number of fields of view to decode in each round of optimization. This will be set to the length of ``fov_index`` if the ``fov_index`` is specified. 
+* fov\_per\_iteration -- The number of fields of view to decode in each round of optimization. This will be set to the length of ``fov_index`` if the ``fov_index`` parameter is specified. * estimate\_initial\_scale\_factors\_from\_cdf -- Flag indicating if the initial scale factors should be estimated from the pixel intensity cdf. If false, the initial scale factors are all set to 1. If true, the initial scale factors are based on the 90th percentile of the pixe intensity cdf. * area\_threshold -- The minimum barcode area for barcodes to be used in the calculation of the scale factors. From 0e734e07deb2f2f68b5ae253e90c2bda4e630c3d Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 14:45:47 -0400 Subject: [PATCH 56/66] moving z plane duplicate removal to decode step --- merlin/analysis/decode.py | 26 +++++++++++++++++ merlin/analysis/filterbarcodes.py | 46 +++++++------------------------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/merlin/analysis/decode.py b/merlin/analysis/decode.py index 6cd1b1ce..4e610b61 100755 --- a/merlin/analysis/decode.py +++ b/merlin/analysis/decode.py @@ -8,6 +8,7 @@ from merlin.util import decoding from merlin.util import barcodedb from merlin.data.codebook import Codebook +from merlin.util import barcodefilters class BarcodeSavingParallelAnalysisTask(analysistask.ParallelAnalysisTask): @@ -20,6 +21,14 @@ def __init__(self, dataSet: dataset.DataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) + if 'remove_z_duplicated_barcodes' not in self.parameters: + self.parameters['remove_z_duplicated_barcodes'] = False + if self.parameters['remove_z_duplicated_barcodes']: + if 'z_duplicate_zPlane_threshold' not in self.parameters: + self.parameters['z_duplicate_zPlane_threshold'] = 1 + if 'z_duplicate_xy_pixel_threshold' not in self.parameters: + self.parameters['z_duplicate_xy_pixel_threshold'] = np.sqrt(2) + def _reset_analysis(self, fragmentIndex: int = None) -> 
None: super()._reset_analysis(fragmentIndex) self.get_barcode_database().empty_database(fragmentIndex) @@ -31,6 +40,15 @@ def get_barcode_database(self) -> barcodedb.BarcodeDB: """ return barcodedb.PyTablesBarcodeDB(self.dataSet, self) + def remove_z_duplicate_barcodes(self, bc): + bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bc, self.parameters['z_duplicate_zPlane_threshold'], + self.parameters['z_duplicate_xy_pixel_threshold'], + self.dataSet.get_z_positions()) + return bc + + + class Decode(BarcodeSavingParallelAnalysisTask): @@ -159,6 +177,14 @@ def _run_analysis(self, fragmentIndex): fragmentIndex, zPositionCount, decodedImages, magnitudeImages, distances) + if self.parameters['remove_z_duplicated_barcodes']: + bcDB = self.get_barcode_database() + bc = self.remove_z_duplicate_barcodes( + bcDB.get_barcodes(fov=fragmentIndex)) + bcDB.empty_database(fragmentIndex) + bcDB.write_barcodes(bc, fov=fragmentIndex) + + def _process_independent_z_slice( self, fov: int, zIndex: int, chromaticCorrector, scaleFactors, backgrounds, preprocessTask, decoder): diff --git a/merlin/analysis/filterbarcodes.py b/merlin/analysis/filterbarcodes.py index ca0c315d..a653d80f 100755 --- a/merlin/analysis/filterbarcodes.py +++ b/merlin/analysis/filterbarcodes.py @@ -4,7 +4,6 @@ from merlin.core import analysistask from merlin.analysis import decode -from merlin.util import barcodefilters class AbstractFilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): @@ -15,21 +14,10 @@ class AbstractFilterBarcodes(decode.BarcodeSavingParallelAnalysisTask): def __init__(self, dataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) - if 'remove_z_duplicated_barcodes' not in self.parameters: - self.parameters['remove_z_duplicated_barcodes'] = False - if self.parameters['remove_z_duplicated_barcodes']: - if 'z_duplicate_zPlane_threshold' not in self.parameters: - self.parameters['z_duplicate_zPlane_threshold'] = 1 - if 
'z_duplicate_xy_pixel_threshold' not in self.parameters: - self.parameters['z_duplicate_xy_pixel_threshold'] = np.sqrt(2) - - def remove_z_duplicate_barcodes(self, bc): - if self.parameters['remove_z_duplicated_barcodes']: - bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( - bc, self.parameters['z_duplicate_zPlane_threshold'], - self.parameters['z_duplicate_xy_pixel_threshold'], - self.dataSet.get_z_positions()) - return bc + def get_codebook(self): + decodeTask = self.dataSet.load_analysis_task( + self.parameters['decode_task']) + return decodeTask.get_codebook() class FilterBarcodes(AbstractFilterBarcodes): @@ -61,11 +49,6 @@ def get_estimated_time(self): def get_dependencies(self): return [self.parameters['decode_task']] - def get_codebook(self): - decodeTask = self.dataSet.load_analysis_task( - self.parameters['decode_task']) - return decodeTask.get_codebook() - def _run_analysis(self, fragmentIndex): decodeTask = self.dataSet.load_analysis_task( self.parameters['decode_task']) @@ -73,12 +56,11 @@ def _run_analysis(self, fragmentIndex): intensityThreshold = self.parameters['intensity_threshold'] distanceThreshold = self.parameters['distance_threshold'] barcodeDB = self.get_barcode_database() - currentBC = decodeTask.get_barcode_database() \ - .get_filtered_barcodes(areaThreshold, intensityThreshold, - distanceThreshold=distanceThreshold, - fov=fragmentIndex) barcodeDB.write_barcodes( - self.remove_z_duplicate_barcodes(currentBC), fov=fragmentIndex) + decodeTask.get_barcode_database().get_filtered_barcodes( + areaThreshold, intensityThreshold, + distanceThreshold=distanceThreshold, fov=fragmentIndex), + fov=fragmentIndex) class GenerateAdaptiveThreshold(analysistask.AnalysisTask): @@ -369,11 +351,6 @@ def get_dependencies(self): return [self.parameters['adaptive_task'], self.parameters['decode_task']] - def get_codebook(self): - decodeTask = self.dataSet.load_analysis_task( - self.parameters['decode_task']) - return decodeTask.get_codebook() - def 
get_adaptive_thresholds(self): """ Get the adaptive thresholds used for filtering barcodes. @@ -396,8 +373,5 @@ def _run_analysis(self, fragmentIndex): currentBarcodes = decodeTask.get_barcode_database()\ .get_barcodes(fragmentIndex) - currentBarcodes = adaptiveTask.extract_barcodes_with_threshold( - threshold, currentBarcodes) - bcDatabase.write_barcodes( - self.remove_z_duplicate_barcodes(currentBarcodes), - fov=fragmentIndex) + bcDatabase.write_barcodes(adaptiveTask.extract_barcodes_with_threshold( + threshold, currentBarcodes), fov=fragmentIndex) From 74e08b8665c5d0b489dcbea99a10420e47bc1b70 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 14:46:10 -0400 Subject: [PATCH 57/66] updating docs and changelog --- CHANGELOG.md | 2 +- docs/tasks.rst | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38171e23..740cea7e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,4 +40,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [0.1.6] - ### Fixed -- Fixed bug and edge cases in removal of barcodes duplicated across z planes +- Fixed bug and edge cases in removal of barcodes duplicated across z planes. Moved to the decode step to prevent unintended conflict with misidentification rate determination. diff --git a/docs/tasks.rst b/docs/tasks.rst index fcd0967f..3d5f29d9 100755 --- a/docs/tasks.rst +++ b/docs/tasks.rst @@ -58,6 +58,9 @@ Parameters: * write_decoded\_images -- Flag indicating if the decoded and intensity images should be written. * minimum\_area -- The area threshold, below which decoded barcodes are ignored. * lowpass\_sigma -- The standard deviation for the low pass filter prior to decoding. +* remove\_z\_duplicated\_barcodes -- Remove putative duplicate barcode counts from adjacent z planes. 
+* z\_duplicate\_zPlane\_threshold -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. +* z\_duplicate\_xy\_pixel\_threshold -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. filterbarcodes.FilterBarcodes ------------------------------ @@ -68,9 +71,6 @@ Parameters: * area\_threshold -- Barcodes with areas below the specified area\_threshold are removed. * intensity\_threshold -- Barcodes with intensities below this threshold are removed. -* remove\_z\_duplicated\_barcodes -- Remove putative duplicate barcode counts from adjacent z planes. -* z\_duplicate\_zPlane\_threshold -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. -* z\_duplicate\_xy\_pixel\_threshold -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. filterbarcodes.GenerateAdaptiveThreshold ------------------------------------------- @@ -90,9 +90,6 @@ Description: Use an adaptive barcode to enrich the decode barcodes for the corre Parameters: * misidentification_rate -- The target misidentification rate, calculated as the number of blank barcodes per blank barcode divided by the number of coding barcodes per coding barcode. -* remove\_z\_duplicated\_barcodes -- Remove putative duplicate barcode counts from adjacent z planes. -* z\_duplicate\_zPlane\_threshold -- If removing putative duplicate barcodes, number of adjacent z planes to consider, generally anything within 2 µm would be worth considering. -* z\_duplicate\_xy\_pixel\_threshold -- If removing putative duplicate barcodes, maximum euclidean distance in xy pixels that can separate the centroids of putative duplicates. 
segment.SegmentCells ---------------------- From 9c2bfd630d0cdb9d0bafd8fdc531e9f63c916130 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 14:46:59 -0400 Subject: [PATCH 58/66] adding test for case where no barcodes are present but get passed to z plane filtering --- test/test_zplane_duplicate_removal.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py index 5bfcd8e7..478c09c8 100644 --- a/test/test_zplane_duplicate_removal.py +++ b/test/test_zplane_duplicate_removal.py @@ -108,3 +108,15 @@ def test_farther_xyrange(): assert ex in keptBC['barcode'].values for notEx in notExpected: assert notEx not in keptBC['barcode'].values + +def test_empty_barcodes(): + zplane_cutoff = 1 + xy_cutoff = np.sqrt(2) + zpositions = [0, 1.5, 3, 4.5, 6, 7.5, 9] + + bcDF = pd.DataFrame([b1]) + bcDF.drop(0, inplace=True) + + keptBC = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bcDF, zplane_cutoff, xy_cutoff, zpositions) + assert type(keptBC) == pd.DataFrame From 7f2551c9f6fedc972c1aae4f04e894364555528b Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 14:52:55 -0400 Subject: [PATCH 59/66] pep8 --- test/test_zplane_duplicate_removal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_zplane_duplicate_removal.py b/test/test_zplane_duplicate_removal.py index 478c09c8..91f4f5fa 100644 --- a/test/test_zplane_duplicate_removal.py +++ b/test/test_zplane_duplicate_removal.py @@ -109,6 +109,7 @@ def test_farther_xyrange(): for notEx in notExpected: assert notEx not in keptBC['barcode'].values + def test_empty_barcodes(): zplane_cutoff = 1 xy_cutoff = np.sqrt(2) From b907c6801f68efd938cfa059d312664ded6653df Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 14:53:13 -0400 Subject: [PATCH 60/66] updating tests analysis parameters --- .../auxiliary_files/test_analysis_parameters.json | 15 ++++++--------- 1 file changed, 6 
insertions(+), 9 deletions(-) diff --git a/test/auxiliary_files/test_analysis_parameters.json b/test/auxiliary_files/test_analysis_parameters.json index dd9d553f..84cbd4e7 100755 --- a/test/auxiliary_files/test_analysis_parameters.json +++ b/test/auxiliary_files/test_analysis_parameters.json @@ -46,7 +46,10 @@ "preprocess_task": "DeconvolutionPreprocess", "optimize_task": "Optimize2", "global_align_task": "SimpleGlobalAlignment", - "crop_width": 10 + "crop_width": 10, + "remove_zplane_duplicates": true, + "z_planes_above_below": 1, + "max_centroid_separation": 1.414 } }, { @@ -67,10 +70,7 @@ "parameters": { "decode_task": "Decode", "area_threshold": 5, - "intensity_threshold": 1, - "remove_zplane_duplicates": true, - "z_planes_above_below": 1, - "max_centroid_separation": 1.414 + "intensity_threshold": 1 } }, { @@ -86,10 +86,7 @@ "module": "merlin.analysis.filterbarcodes", "parameters": { "decode_task": "Decode", - "adaptive_task": "GenerateAdaptiveThreshold", - "remove_zplane_duplicates": true, - "z_planes_above_below": 1, - "max_centroid_separation": 1.414 + "adaptive_task": "GenerateAdaptiveThreshold" } }, { From fd627bd236b37dcc84c32efd22e42be72d2d2409 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Wed, 15 Apr 2020 15:00:38 -0400 Subject: [PATCH 61/66] updating test analysis parameters --- test/auxiliary_files/test_analysis_parameters.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/auxiliary_files/test_analysis_parameters.json b/test/auxiliary_files/test_analysis_parameters.json index 84cbd4e7..2e9f9212 100755 --- a/test/auxiliary_files/test_analysis_parameters.json +++ b/test/auxiliary_files/test_analysis_parameters.json @@ -47,9 +47,9 @@ "optimize_task": "Optimize2", "global_align_task": "SimpleGlobalAlignment", "crop_width": 10, - "remove_zplane_duplicates": true, - "z_planes_above_below": 1, - "max_centroid_separation": 1.414 + "remove_z_duplicated_barcodes": true, + "z_duplicate_zPlane_threshold": 1, + 
"z_duplicate_xy_pixel_threshold": 1.414 } }, { From 85af00be6c787a5d69f4f7219dad1611fc190164 Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Fri, 17 Apr 2020 14:45:15 -0400 Subject: [PATCH 62/66] moving paraams and function from barcodesaving task to decode --- merlin/analysis/decode.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/merlin/analysis/decode.py b/merlin/analysis/decode.py index 4e610b61..2ae7e98f 100755 --- a/merlin/analysis/decode.py +++ b/merlin/analysis/decode.py @@ -21,14 +21,6 @@ def __init__(self, dataSet: dataset.DataSet, parameters=None, analysisName=None): super().__init__(dataSet, parameters, analysisName) - if 'remove_z_duplicated_barcodes' not in self.parameters: - self.parameters['remove_z_duplicated_barcodes'] = False - if self.parameters['remove_z_duplicated_barcodes']: - if 'z_duplicate_zPlane_threshold' not in self.parameters: - self.parameters['z_duplicate_zPlane_threshold'] = 1 - if 'z_duplicate_xy_pixel_threshold' not in self.parameters: - self.parameters['z_duplicate_xy_pixel_threshold'] = np.sqrt(2) - def _reset_analysis(self, fragmentIndex: int = None) -> None: super()._reset_analysis(fragmentIndex) self.get_barcode_database().empty_database(fragmentIndex) @@ -40,15 +32,6 @@ def get_barcode_database(self) -> barcodedb.BarcodeDB: """ return barcodedb.PyTablesBarcodeDB(self.dataSet, self) - def remove_z_duplicate_barcodes(self, bc): - bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( - bc, self.parameters['z_duplicate_zPlane_threshold'], - self.parameters['z_duplicate_xy_pixel_threshold'], - self.dataSet.get_z_positions()) - return bc - - - class Decode(BarcodeSavingParallelAnalysisTask): @@ -74,6 +57,13 @@ def __init__(self, dataSet: dataset.MERFISHDataSet, self.parameters['decode_3d'] = False if 'memory_map' not in self.parameters: self.parameters['memory_map'] = False + if 'remove_z_duplicated_barcodes' not in self.parameters: + 
self.parameters['remove_z_duplicated_barcodes'] = False + if self.parameters['remove_z_duplicated_barcodes']: + if 'z_duplicate_zPlane_threshold' not in self.parameters: + self.parameters['z_duplicate_zPlane_threshold'] = 1 + if 'z_duplicate_xy_pixel_threshold' not in self.parameters: + self.parameters['z_duplicate_xy_pixel_threshold'] = np.sqrt(2) self.cropWidth = self.parameters['crop_width'] self.imageSize = dataSet.get_image_dimensions() @@ -179,7 +169,7 @@ def _run_analysis(self, fragmentIndex): if self.parameters['remove_z_duplicated_barcodes']: bcDB = self.get_barcode_database() - bc = self.remove_z_duplicate_barcodes( + bc = self._remove_z_duplicate_barcodes( bcDB.get_barcodes(fov=fragmentIndex)) bcDB.empty_database(fragmentIndex) bcDB.write_barcodes(bc, fov=fragmentIndex) @@ -238,3 +228,10 @@ def _extract_and_save_barcodes( self.cropWidth, zIndex, globalTask, minimumArea) for i in range(self.get_codebook().get_barcode_count())]), fov=fov) + + def _remove_z_duplicate_barcodes(self, bc): + bc = barcodefilters.remove_zplane_duplicates_all_barcodeids( + bc, self.parameters['z_duplicate_zPlane_threshold'], + self.parameters['z_duplicate_xy_pixel_threshold'], + self.dataSet.get_z_positions()) + return bc From 1436fc434aee7abe4fd34fac80f140ef4004ab4c Mon Sep 17 00:00:00 2001 From: Stephen Eichhorn Date: Mon, 20 Apr 2020 14:56:32 -0400 Subject: [PATCH 63/66] updating license --- license.md | 150 ++++++++--------------------------------------------- 1 file changed, 21 insertions(+), 129 deletions(-) diff --git a/license.md b/license.md index cae68616..5d83d798 100644 --- a/license.md +++ b/license.md @@ -1,129 +1,21 @@ -**Academic and/or Non-Commercial Research Use Software License and Terms of -Use** - -This Agreement concerns MERLIN (the “Software”), useful for analysis of images -obtained using MERFISH. The Software was developed by George Emanuel, Stephen -Eichhorn, and Xiaowei Zhuang at Harvard University. 
- -Using the Software indicates your agreement to be bound by the terms of this -Software Use Agreement (“Agreement”). Absent your agreement to the terms below, -you (the “End User”) have no rights to hold or use the Software whatsoever. - -President and Fellows of Harvard College (“Harvard”) agrees to grant hereunder -the limited non-exclusive license to End User for the use of the Software in the -performance of End User’s internal, non-commercial research and/or academic use -at End User’s institution or company (“Institution”) on the following terms and -conditions: - -1. **NO REDISTRIBUTION.** The Software remains the property of Harvard, and - except as set forth in Section 4, End User shall not publish, distribute, or - otherwise transfer or make available the Software to any other party. - -2. **NO COMMERCIAL USE.** End User shall not use the Software for commercial - purposes and any such use of the Software is expressly prohibited. This - prohibition includes, but is not limited to, use of the Software in - fee-for-service arrangements or to provide research services to (or in - collaboration with) third parties for a fee. This prohibition does not - extend to use for internal research purposes within a for-profit entity. If - End User wishes to use the Software for commercial purposes prohibited - herein, or for any other restricted purpose, End User must execute a - separate license agreement with Harvard. - - > *Requests for use of the Software for or on behalf of for-profit entities or - > for any commercial purposes, please contact*: - > - > Office of Technology Development - Harvard University - Smith Campus Center, Suite 727E - 1350 Massachusetts Avenue - Cambridge, MA 02138 USA - Telephone: (617) 495-3067 - E-mail: otd\@harvard.edu - -3. **OWNERSHIP AND COPYRIGHT NOTICE.** Harvard owns all intellectual property - in the Software. End User shall gain no ownership to the Software. 
End User - shall not remove or delete and shall retain in the Software, in any - modifications to Software and in any Derivative Works, the copyright, - trademark, or other notices pertaining to Software as provided with the - Software. - -4. **DERIVATIVE WORKS.** End User may create and use Derivative Works, as such - term is defined under U.S. copyright laws, provided that any such Derivative - Works shall be restricted to non-commercial, internal research and/or - academic use at End User’s Institution. End User may distribute Derivative - Works to other institutions solely for the performance of non-commercial, - internal research and/or academic use on terms substantially similar to this - License and Terms of Use. - -5. **FEEDBACK.** In order to improve the Software, comments from End Users may - be useful. End User agrees to provide Harvard with feedback on the End - User’s use of the Software (e.g., any bugs in the Software, the user - experience, etc.). Harvard is permitted to use such information provided by - End User in making changes and improvements to the Software without - compensation or an accounting to End User. - -6. **NON ASSERT.** End User acknowledges that Harvard may develop modifications - to the Software that may be based on the feedback provided by End User under - Section 5 above. Harvard shall not be restricted in any way by End User - regarding the use of such information. End User acknowledges the right of - Harvard to prepare, publish, display, reproduce, transmit and or use - modifications to the Software that may be substantially similar or - functionally equivalent to End User’s modifications and/or improvements if - any. In the event that End User obtains patent protection for any - modification or improvement to Software, End User agrees not to allege or - enjoin infringement of End User’s patent against Harvard or any of its - researchers, medical or research staff, officers, directors and employees. - -7. 
**PUBLICATION & ATTRIBUTION.** End User has the right to publish, present, - or share results from the use of the Software.  In accordance with customary - academic practice, End User will acknowledge Xiaowei Zhuang Laboratory at - Harvard as the provider of the Software. - -8. **NO WARRANTIES.** THE SOFTWARE IS PROVIDED "AS IS." TO THE FULLEST EXTENT - PERMITTED BY LAW, HARVARD HEREBY DISCLAIMS ALL WARRANTIES OF ANY KIND - (EXPRESS, IMPLIED OR OTHERWISE) REGARDING THE SOFTWARE, INCLUDING BUT NOT - LIMITED TO ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A - PARTICULAR PURPOSE, OWNERSHIP, AND NON-INFRINGEMENT. HARVARD MAKES NO - WARRANTY ABOUT THE ACCURACY, RELIABILITY, COMPLETENESS, TIMELINESS, - SUFFICIENCY OR QUALITY OF THE SOFTWARE. HARVARD DOES NOT WARRANT THAT THE - SOFTWARE WILL OPERATE WITHOUT ERROR OR INTERRUPTION. - -9. **Limitations of Liability and Remedies**. USE OF THE SOFTWARE IS AT END - USER’S OWN RISK. IF END USER IS DISSATISFIED WITH THE SOFTWARE, ITS - EXCLUSIVE REMEDY IS TO STOP USING IT. IN NO EVENT SHALL HARVARD BE LIABLE TO - END USER OR ITS INSTITUTION, IN CONTRACT, TORT OR OTHERWISE, FOR ANY DIRECT, - INDIRECT, SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR OTHER DAMAGES OF - ANY KIND WHATSOEVER ARISING OUT OF OR IN CONNECTION WITH THE SOFTWARE, EVEN - IF HARVARD IS NEGLIGENT OR OTHERWISE AT FAULT, AND REGARDLESS OF WHETHER - HARVARD IS ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. - -10. 
**INDEMNIFICATION.** To the extent permitted by law, End User shall - indemnify, defend and hold harmless Harvard and its current or future - directors, trustees, officers, faculty, medical and professional staff, - employees, students and agents and their respective successors, heirs and - assigns (the "Indemnitees"), against any liability, damage, loss or expense - (including reasonable attorney's fees and expenses of litigation) incurred - by or imposed upon the Indemnitees or any one of them in connection with any - claims, suits, actions, demands or judgments arising from End User’s breach - of this Agreement or its Institution’s use of the Software except to the - extent caused by the gross negligence or willful misconduct of Harvard. This - indemnification provision shall survive expiration or termination of this - Agreement. - -11. **GOVERNING LAW.** This Agreement shall be construed and governed by the - laws of the Commonwealth of Massachusetts regardless of otherwise applicable - choice of law standards. - -12. **NON-USE OF NAME.** Nothing in this License and Terms of Use shall be - construed as granting End Users or their Institutions any rights or licenses - to use any trademarks, service marks or logos associated with the Software. - You may not use the terms “Harvard” (or a substantially similar term) in any - way that is inconsistent with the permitted uses described herein. You agree - not to use any name or emblem of Harvard or any of its subdivisions for any - purpose, or to falsely suggest any relationship between End User (or its - Institution) and Harvard, or in any manner that would infringe or violate - any of Harvard’s rights. - -13. End User represents and warrants that it has the legal authority to enter - into this License and Terms of Use on behalf of itself and its Institution. 
- +The MIT License + +Copyright (c) 2019 Harvard University + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From 616d8e9531aa5fe3825c722db25a6e43386f6c52 Mon Sep 17 00:00:00 2001 From: seichhorn Date: Mon, 20 Apr 2020 16:25:07 -0400 Subject: [PATCH 64/66] adding citation --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9f9ce6f0..352ba6e4 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ single task or split among many subtasks that can be executed in parallel. MERli execute workflows on a single computer, on a high performance cluster, or on the cloud (AWS and Google Cloud). +If MERlin is useful for your research, consider citing: +Emanuel, G., Eichhorn, S. W., Zhuang, X. 2020, MERlin - scalable and extensible MERFISH analysis software, v0.1.6, Zenodo, doi:10.5281/zenodo.3758540 + Please find the most recent version of MERlin [here](https://github.com/emanuega/merlin). 
## MERFISH data analysis From 29bda2cb184837f8e106a017d93d8d0b0a23ff3c Mon Sep 17 00:00:00 2001 From: seichhorn Date: Mon, 20 Apr 2020 18:16:21 -0400 Subject: [PATCH 65/66] adding doi badge --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 352ba6e4..693ff220 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ [![CircleCI](https://circleci.com/gh/emanuega/MERlin/tree/master.svg?style=svg)](https://circleci.com/gh/emanuega/MERlin/tree/master) [![codecov](https://codecov.io/gh/emanuega/MERlin/branch/master/graph/badge.svg)](https://codecov.io/gh/emanuega/MERlin) +[![DOI](https://zenodo.org/badge/202668055.svg)](https://zenodo.org/badge/latestdoi/202668055) # MERlin - Extensible pipeline for scalable data analysis From 5056b1c1322962e25d8acf1a2e8f9679fa90d31c Mon Sep 17 00:00:00 2001 From: Demian Lesik Date: Mon, 16 Nov 2020 14:03:06 +0300 Subject: [PATCH 66/66] Change .format to % in dataset.py --- merlin/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/merlin/core/dataset.py b/merlin/core/dataset.py index bc120af6..897d246b 100755 --- a/merlin/core/dataset.py +++ b/merlin/core/dataset.py @@ -65,7 +65,7 @@ def __init__(self, dataDirectoryName: str, self.rawDataPortal = dataportal.DataPortal.create_portal( self.rawDataPath) if not self.rawDataPortal.is_available(): - print('The raw data is not available at %s'.format( + print(('The raw data is not available at %s') % ( self.rawDataPath)) self.analysisPath = os.sep.join([analysisHome, dataDirectoryName])