diff --git a/install_elastix_R.sh b/install_elastix_R.sh index e0403d28..8a292a62 100644 --- a/install_elastix_R.sh +++ b/install_elastix_R.sh @@ -8,9 +8,9 @@ sudo apt install python3-pip sudo apt install r-base # Install elastix -wget https://github.com/SuperElastix/elastix/releases/download/4.9.0/elastix-4.9.0-linux.tar.bz2 +wget https://github.com/SuperElastix/elastix/releases/download/5.0.0/elastix-5.0.0-linux.tar.bz2 mkdir ~/elastix -tar xjf elastix-4.9.0-linux.tar.bz2 -C ~/elastix +tar xjf elastix-5.0.0-linux.tar.bz2 -C ~/elastix home=~/ echo -e "\n# paths added by my LAMA installation" >> ~/.bashrc diff --git a/lama/__init__.py b/lama/__init__.py index 83cdd0a8..ef6522a4 100755 --- a/lama/__init__.py +++ b/lama/__init__.py @@ -5,4 +5,4 @@ warnings.filterwarnings("ignore") -matplotlib.use('Agg') +#matplotlib.use('Agg') diff --git a/lama/common.py b/lama/common.py index 75b735ff..a76eef7a 100755 --- a/lama/common.py +++ b/lama/common.py @@ -42,6 +42,7 @@ ORGAN_VOLUME_CSV_FILE = 'organ_volumes.csv' STAGING_INFO_FILENAME = 'staging_info_volume.csv' +FOLDING_FILE_NAME = 'folding_report.csv' lama_root_dir = Path(lama.__file__).parent @@ -116,7 +117,7 @@ def excepthook_overide(exctype, value, traceback): def command_line_agrs(): - return ', '.join(sys.argv) + return ' '.join(sys.argv) def touch(file_: Path): @@ -238,7 +239,7 @@ def write_array(array: np.ndarray, path: Union[str, Path], compressed=True, ras= img = sitk.GetImageFromArray(array) if ras: img.SetDirection((-1, 0, 0, 0, -1, 0, 0, 0, 1)) - sitk.WriteImage(sitk.GetImageFromArray(array), path, compressed) + sitk.WriteImage(img, path, compressed) def read_array( path: Union[str, Path]): @@ -373,7 +374,7 @@ def mkdir_if_not_exists(dir_: Union[str, Path]): def get_file_paths(folder: Union[str, Path], extension_tuple=('.nrrd', '.tiff', '.tif', '.nii', '.bmp', 'jpg', 'mnc', 'vtk', 'bin', 'npy'), - pattern: str = None, ignore_folder: str = "") -> Union[List[str], List[Path]]: + pattern: str = None, ignore_folders: Union[List, str] = []) -> Union[List[str], List[Path]]: """ Given a directory return all image paths within all sibdirectories. 
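As a quick illustration of the `ignore_folders` parameter introduced in the `get_file_paths` signature above (and reworked in the hunk below): a minimal usage sketch, not part of the diff, assuming `lama.common` is importable and the registration folder exists:

```python
from lama import common

# Collect image paths, skipping the elastix intermediate-output folders.
# A plain string is also accepted and is wrapped into a one-item list.
vols = common.get_file_paths('output/registrations/rigid',  # hypothetical folder
                             ignore_folders=['resolution_images', 'pyramid_images'])
```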
@@ -385,8 +386,8 @@ def get_file_paths(folder: Union[str, Path], extension_tuple=('.nrrd', '.tiff', Select only images with these extensions pattern Do a simple `pattern in filename` filter on filenames - ignore_folder - do not look in folder with this name + ignore_folders + do not look in folders with these names Notes ----- @@ -395,32 +396,33 @@ def get_file_paths(folder: Union[str, Path], extension_tuple=('.nrrd', '.tiff', Do not include hidden filenames """ + paths = [] - if not os.path.isdir(folder): - return False - else: - paths = [] + if isinstance(ignore_folders, str): + ignore_folders = [ignore_folders] - for root, subfolders, files in os.walk(folder): - if ignore_folder in subfolders: - subfolders.remove(ignore_folder) + for root, subfolders, files in os.walk(folder): - for filename in files: + for f in ignore_folders: + if f in subfolders: + subfolders.remove(f) - if filename.lower().endswith(extension_tuple) and not filename.startswith('.'): + for filename in files: - if pattern: + if filename.lower().endswith(extension_tuple) and not filename.startswith('.'): - if pattern and pattern not in filename: - continue + if pattern: - paths.append(os.path.abspath(os.path.join(root, filename))) + if pattern and pattern not in filename: + continue - if isinstance(folder, str): - return paths - else: - return [Path(x) for x in paths] + paths.append(os.path.abspath(os.path.join(root, filename))) + + if isinstance(folder, str): + return paths + else: + return [Path(x) for x in paths] def check_config_entry_path(dict_, key): @@ -466,6 +468,14 @@ def getfile_endswith(dir_: Path, suffix: str): raise FileNotFoundError(f'cannot find path file ending with {suffix} in {dir_}') from e +def getfile_startswith_endswith(dir_: Path, prefix: str, suffix: str): + try: + return [x for x in dir_.iterdir() if x.name.endswith(suffix) and x.name.startswith(prefix)][0] + except IndexError as e: + raise FileNotFoundError(f'cannot find path starting with {prefix} and ending with {suffix} in {dir_}') from e + + + def get_inputs_from_file_list(file_list_path, config_dir): """ Gte the input files @@ -956,3 +966,21 @@ def cfg_load(cfg) -> Dict: else: raise ValueError('Config file should end in .toml or .yaml') + + +def bytesToGb(numberOfBytes, precision = 3): + return round(numberOfBytes / (1024 ** 3), precision) + +def logMemoryUsageInfo(): + proc = psutil.Process(os.getpid()) + + procMemInfo = proc.memory_info() + globalMemInfo = psutil.virtual_memory() + + totalMem = bytesToGb(globalMemInfo.total, 5) + totalMemAvailable = bytesToGb(globalMemInfo.available, 5) + procMemRSS = bytesToGb(procMemInfo.rss, 5) + procMemVMS = bytesToGb(procMemInfo.vms, 5) + procMemData = bytesToGb(procMemInfo.data, 5) + + logging.info(f"Memory: Process Resident: {procMemRSS}, Process Virtual: {procMemVMS}, Process data: {procMemData}. Total: {totalMem}, Total Available: {totalMemAvailable}")
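The new `logMemoryUsageInfo` helper reports process and system memory in GiB via psutil. A standalone sketch of the same approach (assumes psutil is installed; `memory_info().data` is Linux-only, so only `rss`/`vms` are shown here):

```python
import os
import logging
import psutil

def bytes_to_gb(num_bytes: int, precision: int = 3) -> float:
    return round(num_bytes / (1024 ** 3), precision)  # 1 GiB = 1024**3 bytes

proc_mem = psutil.Process(os.getpid()).memory_info()
sys_mem = psutil.virtual_memory()
logging.info(f"rss={bytes_to_gb(proc_mem.rss)} GiB, vms={bytes_to_gb(proc_mem.vms)} GiB, "
             f"available={bytes_to_gb(sys_mem.available)} of {bytes_to_gb(sys_mem.total)} GiB")
```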
Total Available: {totalMemAvailable}") diff --git a/lama/elastix/__init__.py b/lama/elastix/__init__.py index aea6cf6d..37997906 100644 --- a/lama/elastix/__init__.py +++ b/lama/elastix/__init__.py @@ -13,5 +13,6 @@ VOLUME_CALCULATIONS_FILENAME = "organvolumes.csv" INVERT_CONFIG = 'invert.yaml' REG_DIR_ORDER = 'reg_order.txt' -IGNORE_FOLDER = 'resolution_images' # When reading images from dir and subdirs, ignore images in this folder +RESOLUTION_IMGS_DIR = 'resolution_images' # When reading images from dir and subdirs, ignore images in this folder TRANSFORMIX_OUT = 'result.nrrd' # This will not be correct if filetype is not nrrd +IMG_PYRAMID_DIR = 'pyramid_images' \ No newline at end of file diff --git a/lama/elastix/deformations.py b/lama/elastix/deformations.py index 1d882cde..04f91ec1 100755 --- a/lama/elastix/deformations.py +++ b/lama/elastix/deformations.py @@ -22,18 +22,40 @@ TRANSFORMIX_LOG = 'transformix.log' -def make_deformations_at_different_scales(config: Union[LamaConfig, dict]): +def make_deformations_at_different_scales(config: Union[LamaConfig, dict]) -> Union[None, np.array]: """ - Generate jacobian determinants ans optionaly defromation vectors + Generate jacobian determinants and optionaly defromation vectors Parameters ---------- config: LamaConfig object if running from other lama module Path to config file if running this module independently + + Notes + ----- + How to generate the hacobian determinants is defined by the config['generate_deformation_fields'] entry. + + toml representation from the LAMA config: + + [generate_deformation_fields] + 192_to_10 = [ "deformable_192_to_10",] + + This will create a set of jacobian determinants and optional deformation fields called 192_to_10 and using the + the named regisrtation stages in the list. + + Multiple sets of key/value pairs are allowed so that diffrent jacobians determinatnts might be made. 
- if isinstance(config, Path): - config = LamaConfig(config) + + if isinstance(config, (str, Path)): + config = LamaConfig(Path(config)) if not config['generate_deformation_fields']: return @@ -76,8 +98,9 @@ def make_deformations_at_different_scales(config: Union[LamaConfig, dict]): log_jacobians_scale_dir = log_jacobians_dir / deformation_id log_jacobians_scale_dir.mkdir() - generate_deformation_fields(reg_stage_dirs, resolutions, deformation_scale_dir, jacobians_scale_dir, + neg_jac_arr = generate_deformation_fields(reg_stage_dirs, resolutions, deformation_scale_dir, jacobians_scale_dir, log_jacobians_scale_dir, make_vectors, threads=config['threads'], filetype=config['filetype']) + return neg_jac_arr def generate_deformation_fields(registration_dirs: List, @@ -150,8 +173,9 @@ def generate_deformation_fields(registration_dirs: List, # Copy the tp files into the temp directory and then modify to add initail transform # pass in the last tp file [-1] as the other tp files are intyernally referenced - get_deformations(transform_params[-1], deformation_dir, jacobian_dir, log_jacobians_dir, filetype, specimen_id, + neg_jac_array = get_deformations(transform_params[-1], deformation_dir, jacobian_dir, log_jacobians_dir, filetype, specimen_id, threads, jacmat, get_vectors) + return neg_jac_array def _modfy_tforms(tforms: List): @@ -166,7 +190,7 @@ def _modfy_tforms(tforms: List): for i, tp in enumerate(tforms[1:]): initial_tp = tforms[i] - with open(tp, 'rb') as fh: + with open(tp, 'r') as fh: lines = [] for line in fh: if line.startswith('(InitialTransformParametersFileName'): @@ -175,7 +199,7 @@ previous_tp_str = '(InitialTransformParametersFileName "{}")'.format(initial_tp) lines.insert(0, previous_tp_str + '\n') - with open(tp, 'wb') as wh: + with open(tp, 'w') as wh: for line in lines: wh.write(line) @@ -188,9 +212,13 @@ def get_deformations(tform: Path, specimen_id: str, threads: int, make_jacmat: bool, - get_vectors: bool = False): + get_vectors: bool = False) -> Union[None, np.array]: """ Generate spatial jacobians and optionally deformation files. 
+ + Returns + ------- + the jacobian array if there are any values < 0 """ cmd = ['transformix', @@ -265,5 +293,8 @@ def get_deformations(tform: Path, logging.info('Finished generating deformation fields') + if jac_min <=0: + return jac_arr + diff --git a/lama/elastix/elastix_registration.py b/lama/elastix/elastix_registration.py index 7d9cfe7a..1dd2e48a 100755 --- a/lama/elastix/elastix_registration.py +++ b/lama/elastix/elastix_registration.py @@ -1,19 +1,19 @@ -from lama import common from logzero import logger as logging -import sys -from os.path import join, isdir, splitext, basename, relpath, exists, abspath, dirname, realpath +from os.path import join, isdir, splitext, basename, relpath import subprocess import os import shutil from collections import defaultdict from pathlib import Path -from typing import Union import yaml import SimpleITK as sitk -REOLSUTION_TP_PREFIX = 'TransformParameters.0.R' +from lama.elastix.folding import unfold_bsplines +from lama import common +from lama.elastix import ELX_TRANSFORM_PREFIX, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR + +RESOLUTION_TP_PREFIX = 'TransformParameters.0.R' FULL_STAGE_TP_FILENAME = 'TransformParameters.0.txt' -RESOLUTION_IMG_FOLDER = 'resolution_images' class ElastixRegistration(object): @@ -49,14 +49,15 @@ def __init__(self, elxparam_file: Path, self.fixed_mask = fixed_mask self.filetype = filetype self.threads = threads - # A subset of volumes from folder to register + self.rename_output = True # Bodge for pairwise reg, or we end up filling all the disks + def make_average(self, out_path): """ Create an average of the the input embryo volumes. This will search subfolders for all the registered volumes within them """ - vols = common.get_file_paths(self.stagedir, ignore_folder=RESOLUTION_IMG_FOLDER) + vols = common.get_file_paths(self.stagedir, ignore_folders=[RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR]) #logging.info("making average from following volumes\n {}".format('\n'.join(vols))) average = common.average(vols) @@ -68,6 +69,7 @@ class TargetBasedRegistration(ElastixRegistration): def __init__(self, *args): super(TargetBasedRegistration, self).__init__(*args) self.fixed = None + self.fix_folding = False def set_target(self, target): self.fixed = target @@ -77,7 +79,7 @@ def run(self): if self.movdir.is_file(): moving_imgs = [self.movdir] else: - moving_imgs = common.get_file_paths(self.movdir, ignore_folder=RESOLUTION_IMG_FOLDER) # This breaks if not ran from config dir + moving_imgs = common.get_file_paths(self.movdir, ignore_folders=[RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR]) # This breaks if not ran from config dir if len(moving_imgs) < 1: raise common.LamaDataException("No volumes in {}".format(self.movdir)) @@ -99,16 +101,17 @@ def run(self): run_elastix(cmd) # Rename the registered output. - elx_outfile = outdir / f'result.0.{self.filetype}' - new_out_name = outdir / f'{mov_basename}.{self.filetype}' + if self.rename_output: + elx_outfile = outdir / f'result.0.{self.filetype}' + new_out_name = outdir / f'{mov_basename}.{self.filetype}' - try: - shutil.move(elx_outfile, new_out_name) - except IOError: - logging.error('Cannot find elastix output. Ensure the following is not set: (WriteResultImage "false")') - raise + try: + shutil.move(elx_outfile, new_out_name) + except IOError: + logging.error('Cannot find elastix output. 
Ensure the following is not set: (WriteResultImage "false")') + raise - move_intemediate_volumes(outdir) + move_intemediate_volumes(outdir) # add registration metadata reg_metadata_path = outdir / common.INDV_REG_METADATA @@ -118,6 +121,23 @@ def run(self): with open(reg_metadata_path, 'w') as fh: fh.write(yaml.dump(reg_metadata, default_flow_style=False)) + if self.fix_folding: + # Remove any folds in the B-splines, overwrite in place + tform_param_file = outdir / ELX_TRANSFORM_PREFIX + unfold_bsplines(tform_param_file, tform_param_file) + + # Retransform the moving image with corrected tform file + cmd = [ + 'transformix', + '-in', str(mov), + '-out', str(outdir), + '-tp', tform_param_file + ] + subprocess.call(cmd) + unfolded_moving_img = outdir / 'result.nrrd' + new_out_name.unlink() + shutil.move(unfolded_moving_img, new_out_name) + class PairwiseBasedRegistration(ElastixRegistration): @@ -156,7 +176,7 @@ def run(self): 'threads': self.threads, 'fixed': fixed}) # Get the resolution tforms - tforms = list(sorted([x for x in os.listdir(outdir) if x .startswith(REOLSUTION_TP_PREFIX)])) + tforms = list(sorted([x for x in os.listdir(outdir) if x .startswith(RESOLUTION_TP_PREFIX)])) # get the full tform that spans all resolutions full_tp_file_paths.append(join(outdir, FULL_STAGE_TP_FILENAME)) @@ -173,7 +193,7 @@ def run(self): fh.write(yaml.dump(reg_metadata, default_flow_style=False)) for i, files_ in tp_file_paths.items(): - mean_tfom_name = "{}{}.txt".format(REOLSUTION_TP_PREFIX, i) + mean_tfom_name = "{}{}.txt".format(RESOLUTION_TP_PREFIX, i) self.generate_mean_tranform(files_, fixed, fixed_dir, mean_tfom_name, self.filetype) self.generate_mean_tranform(full_tp_file_paths, fixed, fixed_dir, FULL_STAGE_TP_FILENAME, self.filetype) @@ -233,7 +253,7 @@ def generate_mean_tranform(tp_files, fixed_vol, out_dir, tp_out_name, filetype): def run_elastix(args): cmd = ['elastix', '-f', args['fixed'], - '-m', args['mov'], + '-m', f'"{args["mov"]}"', '-out', args['outdir'], '-p', args['elxparam_file'], ] @@ -247,7 +267,6 @@ def run_elastix(args): try: a = subprocess.check_output(cmd) except Exception as e: # can't seem to log CalledProcessError - logging.exception('registration falied:\n\ncommand: {}\n\n error:{}'.format(cmd, e.output)) raise @@ -255,14 +274,26 @@ def move_intemediate_volumes(reg_outdir: Path): """ If using elastix multi-resolution registration and outputting image each resolution, put the intermediate files - in a separate folder + in a separate folder. 
+ + Do the same with pyramid images """ - imgs = common.get_file_paths(reg_outdir) - intermediate_imgs = [x for x in imgs if basename(x).startswith('result.')] + + intermediate_imgs = list(reg_outdir.rglob('*result.0.R*')) #[x for x in imgs if basename(x).startswith('result.')] if len(intermediate_imgs) > 0: - int_dir = join(reg_outdir, RESOLUTION_IMG_FOLDER) - common.mkdir_force(int_dir) + + resolution_img_dir = reg_outdir / RESOLUTION_IMGS_DIR + common.mkdir_force(resolution_img_dir) for int_img in intermediate_imgs: - shutil.move(str(int_img), str(int_dir)) - # convert_16_to_8.convert_16_bit_to_8bit(int_dir, int_dir) + shutil.move(str(int_img), str(resolution_img_dir)) + + pyramid_imgs = list(reg_outdir.rglob('*ImagePyramid*')) + if len(pyramid_imgs) > 0: + + img_pyramid_dir = reg_outdir / IMG_PYRAMID_DIR + common.mkdir_force(img_pyramid_dir) + for pyr_img in pyramid_imgs: + shutil.move(str(pyr_img), str(img_pyramid_dir)) + + diff --git a/lama/elastix/folding.py b/lama/elastix/folding.py new file mode 100644 index 00000000..ee43ac6f --- /dev/null +++ b/lama/elastix/folding.py @@ -0,0 +1,216 @@ +""" +Folding in the deformations can cause problems with the inversions. +This module corrects over-folding and ensures injectivity of the transform +""" +import numpy as np +from pathlib import Path +from typing import Union + + +K2 = 2.046392675 +K3 = 2.479472335 +A2 = np.sqrt( + (3/2)**2 + (K2 - (3/2))**2 +) +A3 = np.sqrt( + ((3/2)**2 + (K2 - (3/2))**2 + (K3 - K2)**2) +) + + +class BSplineParse(): + + def __init__(self, tform_file: str): + """ + Read in the coefficients from an elastix tform parameter file + + Parameters + ---------- + tform_file + the path to the elastix TransformParameter file + + Attributes + ---------- + coefs: np.ndarray + m*n array. m number of control points, n = num axes + + Returns + ------- + + tuple + + 0: + if Bspline: + n*m np.array where n is number of control points and m is vector of coordinates (xyz) + if Euler, similarity, affine + 1d array of transformation parameters + 1: + list of non-transform parameter lines from the tform file + + """ + + with open(tform_file, 'r') as fh: + + other_data = [] + + bspline = False + + for line in fh: + if line.startswith('(TransformParameters '): + tform_str = line.strip() + tform_str = tform_str.strip(')') + tform_str = tform_str.lstrip('(TransformParameters ') + tform_params = tform_str.split(' ') + + tform_params= [float(x) for x in tform_params] + else: + other_data.append(line) + + if line.startswith('(Transform "BSplineTransform")'): + bspline = True + + if bspline: + tform_params = np.array(np.array_split(tform_params, 3)).T + + self.coefs = tform_params + self.elastix_params = other_data + + def control_point_coords(self): + + for line in self.elastix_params: + + if line.startswith('(Size '): + size = [int(x) for x in line.strip().split(' ')[1:]] + + elif line.startswith('GridOrigin'): + grid_origin = [int(x) for x in line.strip().split(' ')[1:]] + + elif line.startswith('GridSpacing'): + grid_spacing = [int(x) for x in line.strip().split(' ')[1:]] + + +def condition_1(coefs): + """ + numpy array of tform coefs of shape n_coefs * dims + + Returns + ------- + + """ + t = np.abs(coefs) < 1/K3 + condition_met = np.apply_along_axis(np.all, 1, t) + return condition_met + + +def condition_2(coefs): + # def f(x): + # # s = np.sum(x) < (1 / A3) ** 2 + # s = np.linalg.norm(x) < (1 / A3) + # return s + # t = coefs**2 + # condition_met = np.apply_along_axis(f, 1, t) + # return condition_met + return np.linalg.norm(coefs, axis=1) < (1 / A3)
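The two tests above implement a sufficient (not necessary) condition for local injectivity of the B-spline transform. With the coefficients rescaled to unit grid spacing (as `unfold_bsplines` does below), a control-point displacement c = (c_x, c_y, c_z) passes if either of these holds, using the constants K3 and A3 defined at the top of the module:

```latex
\text{condition 1:}\quad \max\bigl(|c_x|,\, |c_y|,\, |c_z|\bigr) < \frac{1}{K_3}
\qquad\qquad
\text{condition 2:}\quad \lVert c \rVert_2 < \frac{1}{A_3}
```

A control point is only corrected when it fails both tests; `correct` then rescales the offending vector onto the condition-2 bound.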
+# def correct_1(coefs, potential_fold_indices): +# for idx in potential_fold_indices: +# bounds = (1 / A3) ** 2 +# coefs[idx, :] = bounds +# return coefs + + +def correct(coefs): + coefs = np.copy(coefs) + + result = [] + for c in coefs: + + # Make this vector satisfy condition 2 + unfolded = np.array(c) / np.linalg.norm(c) * (1 / A3) + # unfolded = c * (max_bound / np.sum(c)) + + # Replace the vector of coefs + result.append(unfolded) + + return result + + +def unfold_bsplines(bs: Union[BSplineParse, str], outfile=None) -> np.ndarray: + + if isinstance(bs, (str, Path)): + bs = BSplineParse(bs) + + # Scale the coefficients by the grid size + spacing_str = bs.elastix_params[21].split(' ')[1:4] + grid_size = [float(x.strip().strip(')')) for x in spacing_str] + + # this conversion may be needed if bs.coefs doesn't accept numpy array operations + # scale to (1, 1)-spacing grid + + bs.coefs[:, 0] /= grid_size[0] + bs.coefs[:, 1] /= grid_size[1] + bs.coefs[:, 2] /= grid_size[2] + # grid points that need to be corrected + idx_to_correct = ~(condition_1(bs.coefs) | condition_2(bs.coefs)) + + if not np.any(idx_to_correct): # No folding so write or return original + if outfile: + write_tform(bs.coefs, bs.elastix_params, outfile) + return + else: + return bs + # correct potentially problematic grid points + bs.coefs[idx_to_correct, :] = correct(bs.coefs[idx_to_correct, :]) + # rescale to original grid spacing + bs.coefs[:, 0] *= grid_size[0] + bs.coefs[:, 1] *= grid_size[1] + bs.coefs[:, 2] *= grid_size[2] + + if outfile: + write_tform(bs.coefs, bs.elastix_params, outfile) + + return bs.coefs + + +def write_tform(tform_params, other_data, tform_file, init_tform=None): + """ + Write elastix transform file + + Parameters + ---------- + tform_params + The spline coefficients or transform parameters + m*n array. m = num control points, n = dims of coefs (3) + other_data + Lines of the other data from the transform parameter file + + tform_file + Where to write the tform + + init_tform + + Returns + ------- + + """ + + with open(tform_file, 'w') as fh: + + for line in other_data: + + if init_tform and line.startswith('(InitialTransformParametersFileName'): + line = f'(InitialTransformParametersFileName "{init_tform}")\n' + + if line.startswith('(NumberOfParameters '): + num_params = int(line.strip().strip(')').split(' ')[1]) + + fh.write(line) + + if tform_params.size != num_params: + raise ValueError(f"stated num params: {num_params} does not match actual {tform_params.size}") + # write out all the x column first by using fortran mode. Set print options to prevent ... shortening + np.set_printoptions(threshold=np.prod(tform_params.shape), suppress=True) + + fh.write(f"(TransformParameters ") + tform_params.flatten(order='f').tofile(fh, sep=" ", format="%.6f") + fh.write(')') diff --git a/lama/elastix/invert_transforms.py b/lama/elastix/invert_transforms.py index f17e235f..27248f61 100644 --- a/lama/elastix/invert_transforms.py +++ b/lama/elastix/invert_transforms.py @@ -15,7 +15,7 @@ from lama.registration_pipeline.validate_config import LamaConfig from . 
import (ELX_TRANSFORM_PREFIX, ELX_PARAM_PREFIX, LABEL_INVERTED_TRANFORM, - IMAGE_INVERTED_TRANSFORM, INVERT_CONFIG, IGNORE_FOLDER) + IMAGE_INVERTED_TRANSFORM, INVERT_CONFIG, RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR) def batch_invert_transform_parameters(config: Union[str, LamaConfig], @@ -48,7 +48,7 @@ def batch_invert_transform_parameters(config: Union[str, LamaConfig], # Get the image basenames from the first stage registration folder (usually rigid) # ignore images in non-relevent folder that may be present - volume_names = [x.stem for x in common.get_file_paths(reg_dirs[0], ignore_folder=IGNORE_FOLDER)] + volume_names = [x.stem for x in common.get_file_paths(reg_dirs[0], ignore_folders=[RESOLUTION_IMGS_DIR, IMG_PYRAMID_DIR])] inv_outdir = config.mkdir('inverted_transforms') diff --git a/lama/img_processing/dicom_to_nrrd.sh b/lama/img_processing/dicom_to_nrrd.sh new file mode 100644 index 00000000..24245e5d --- /dev/null +++ b/lama/img_processing/dicom_to_nrrd.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +#Bash script to convert DICOM files to NRRD format in batches. +#Currently converts DICOMs to 16-bit unsigned type (and as such may need modifying). + +#Dependencies: +# Slicer +# dcm2niix module for Slicer (path to executable module depends on the host) - you need to also install the SlicerDcm2nii extension within Slicer + + +#TODO: add genotype detection by identifying 'wt' in the parent directory +#(uCT scans are performed before genotyping to reduce bias and therefore +#a method of simplified labelling post scanning will be required) + +#TODO: double check headers are compatible with LAMA. + +#Make directory +mkdir nrrd_out + +#loops through folders +for directory in */; +do + #Go within the specific DICOM directory: + dir_name=${directory%/*// /_} + cd ${dir_name} + + #Error trap for spaces in dicom filenames from previous PhD student + for f in *\ *; do mv "$f" "${f// /_}"; done + + #Perform the conversion + #TODO: find code to identify where dcm2niix is located. + cd ../ + /home/minc/.config/NA-MIC/Extensions-28257/SlicerDcm2nii/lib/Slicer-4.10/qt-scripted-modules/Resources/bin/dcm2niix -1 -d 0 -f ${dir_name%/} -o nrrd_out -e y -z n ${dir_name} +done diff --git a/lama/paths.py b/lama/paths.py index d18fb24c..747d79fe 100755 --- a/lama/paths.py +++ b/lama/paths.py @@ -9,13 +9,14 @@ # TODO: Link up this code with where the folders are cerated during a LAMA run. Then when changes to folder names occur +# TODO: raise error when a non-LAMA folder is supplied # they are replfected in this iterator def specimen_iterator(reg_out_dir: Path) -> Iterator[Tuple[Path, Path]]: """ Given a registration output root folder , iterate over the speciemns of each line in the subfolders - Note: lama considers the basliene as a single line. + Note: lama considers the baseline as a single line. 
Parameters ---------- @@ -29,7 +30,28 @@ def specimen_iterator(reg_out_dir: Path) -> Iterator[Tuple[Path, Path]]: The path to the line directory The path to the specimen directory + Example folder structure showing which folder to use + ---------------------------------------------------- + ├── baseline │   └── output # This folder can be used as reg_out_dir │   ├── baseline │   └── staging_info_volume.csv └── mutants └── output # This folder can be used as reg_out_dir ├── Ascc1 ├── Bbox1 ├── Copb2 ├── Shmt2 ├── staging_info_volume.csv ├── Synrg ├── Tm2d1 ├── Tm2d2 ├── Vars2 └── Vps37d + + """ + reg_out_dir = Path(reg_out_dir) if not reg_out_dir.is_dir(): raise FileNotFoundError(f'Cannot find output directory {reg_out_dir}') @@ -77,6 +99,7 @@ def setup(self): self.inverted_labels_dirs = self.get_multistage_data(self.outroot / 'inverted_labels') self.qc = self.specimen_root / 'output' / 'qc' self.qc_red_cyan_dirs = self.qc / 'red_cyan_overlays' + self.qc_inverted_labels = self.qc / 'inverted_label_overlay' self.qc_grey_dirs = self.qc / 'greyscales' return self diff --git a/lama/qc/collate_qc.py b/lama/qc/collate_qc.py index fd39911b..32ef99db 100644 --- a/lama/qc/collate_qc.py +++ b/lama/qc/collate_qc.py @@ -1,6 +1,6 @@ """ -Upon completion of a a lama run, this scrpt will collate all the QC images into a single file format x to enable -the fast identification of issues +Upon completion of a LAMA run, this script will collate all the QC images into a single html to enable +the rapid identification of issues. """ from pathlib import Path @@ -19,15 +19,17 @@ from lama.paths import DataIterator, SpecimenDataPaths -def make_grid(root: Path, outdir, qc_type='red_cyan'): +def make_grid(root: Path, outdir, qc_type='red_cyan', height='auto'): """ Parameters ---------- root - A Lama registrtion root for a line (or the baselines) containing multiple + A LAMA registration root for a line (or the baselines) containing multiple specimens outpath Where to put the final image. Filetype is from extension (can use .png and .pdf at least) + height: + the css height property for the img (use 'auto' or px; percentage scaling messes things up)
 """ # Create series of images specimen-based view @@ -36,9 +38,11 @@ def make_grid(root: Path, outdir, qc_type='red_cyan'): spec: SpecimenDataPaths - oris = [HtmlGrid('axial'), - HtmlGrid('coronal'), - HtmlGrid('sagittal')] + single_file = True + + oris = [HtmlGrid('axial', single_file=single_file), + HtmlGrid('coronal', single_file=single_file), + HtmlGrid('sagittal', single_file=single_file)] for i, spec in enumerate(d): @@ -52,36 +56,63 @@ def make_grid(root: Path, outdir, qc_type='red_cyan'): rc_qc_dir = spec.qc_red_cyan_dirs elif qc_type == 'grey': rc_qc_dir = spec.qc_grey_dirs + elif qc_type == 'labels': + rc_qc_dir = spec.qc_inverted_labels for grid in oris: spec.specimen_id spec_title = f'{spec.line_id}: {spec.specimen_id}' grid.next_row(title=spec_title) + # s = list((rc_qc_dir / grid.title).iterdir()) for img_path in natsorted((rc_qc_dir / grid.title).iterdir(), key=lambda x: x.stem): - relpath = Path(os.path.relpath(img_path, outdir)) + img_path = img_path.resolve() + outdir = outdir.resolve() + + if single_file: # abspath + path_to_use = img_path + else: # Relative path + path_to_use = Path(os.path.relpath(img_path, outdir)) + img_caption = f'{truncate_str(img_path.stem, 30)}' tooltip = f'{spec.line_id}:{spec.specimen_id}:{img_path.stem}' - grid.next_image(relpath, img_caption, tooltip) + grid.next_image(path_to_use, img_caption, tooltip) for grid in oris: ori_out = outdir / f'{grid.title}.html' - grid.save(ori_out) + grid.save(ori_out, height) -def run(reg_root: Path, out_root: Path): +def run(reg_root: Path, out_root: Path, height): rc_dir = out_root / 'red_cyan' rc_dir.mkdir(exist_ok=True) - make_grid(reg_root, rc_dir, 'red_cyan') - + make_grid(reg_root, rc_dir, 'red_cyan', height=height) + # g_dir = out_root / 'greyscales' g_dir.mkdir(exist_ok=True) - make_grid(reg_root, g_dir, 'grey') + make_grid(reg_root, g_dir, 'grey', height=height) + g_dir = out_root / 'inverted_labels' + g_dir.mkdir(exist_ok=True) + try: + make_grid(reg_root, g_dir, 'labels', height) + except FileNotFoundError: + print('Cannot find inverted label overlays. Skipping') if __name__ =='__main__': - import sys - run(Path(sys.argv[1]), Path(sys.argv[2])) \ No newline at end of file + + import argparse + parser = argparse.ArgumentParser("Generate HTML registration image reports") + + parser.add_argument('-i', '--indir', dest='indir', help='A lama registration output directory containing one or more line directories', + required=True) + parser.add_argument('-o', '--out', dest='out', help='output directory', + required=True) + parser.add_argument('-height', '--height', dest='height', help='The height of the images, e.g. "auto", "200px"', + required=False, default='auto')
eg "auto", 200px', + required=False, default='auto') + + args = parser.parse_args() + run(Path(args.indir), Path(args.out), args.height) \ No newline at end of file diff --git a/lama/qc/common.py b/lama/qc/common.py new file mode 100644 index 00000000..2c8818c6 --- /dev/null +++ b/lama/qc/common.py @@ -0,0 +1,21 @@ +from pathlib import Path +from typing import Tuple + + +def final_red_cyan_iterator(root, orientation='sagittal') -> Tuple[Path, str, str]: + """ + Get the red/cyan overlay of the final registered image + + Parameters + ---------- + root + Directory containing one or more lama lines directories + orientation + sagittal, coronal or axial + Returns + ------- + 0: Path to the image + 1: line_id + 2: specimen id + """ + for line_dir in root.it \ No newline at end of file diff --git a/lama/qc/folding.py b/lama/qc/folding.py new file mode 100644 index 00000000..646e690c --- /dev/null +++ b/lama/qc/folding.py @@ -0,0 +1,40 @@ +import numpy as np +import pandas as pd +from lama import common +from typing import Union +from pathlib import Path + + +def folding_report(jac_array, label_map: np.ndarray, label_info: Union[pd.DataFrame, str, Path] = None, outdir=None): + """ + Write out csv detailing the presence of folding per organ + """ + if jac_array is None: + return + labels = np.unique(label_map) + + if not isinstance(label_info, pd.DataFrame): + label_info = pd.read_csv(label_info, index_col=0) + + result = [] + for label in labels: + # print(label) + if label == 0: + continue + jac_label = jac_array[label_map == label] + label_size = jac_label.size + num_neg_vox = jac_label[jac_label < 0].size + total_folding = np.sum(jac_label[jac_label < 0]) + result.append([label, label_size, num_neg_vox, total_folding]) + + df = pd.DataFrame.from_records(result) + df.columns = ['label', 'label_size', 'num_neg_voxels', 'summed_folding'] + df.set_index('label', drop=True, inplace=True) + + if label_info is not None: + df = df.merge(label_info[['label_name']], left_index=True, right_index=True) + + if outdir: + df.to_csv(outdir / common.FOLDING_FILE_NAME) + else: + return df \ No newline at end of file diff --git a/lama/qc/formatting.py b/lama/qc/formatting.py new file mode 100644 index 00000000..c0bb9e8d --- /dev/null +++ b/lama/qc/formatting.py @@ -0,0 +1,33 @@ +""" +Add the scientific notation to the axes label +Adapted from: https://peytondmurray.github.io/coding/fixing-matplotlibs-scientific-notation/# +""" + + +def label_offset(ax, axis=None): + + if axis == "y" or not axis: + fmt = ax.yaxis.get_major_formatter() + ax.yaxis.offsetText.set_visible(False) + set_label = ax.set_ylabel + label = ax.get_ylabel() + + elif axis == "x" or not axis: + fmt = ax.xaxis.get_major_formatter() + ax.xaxis.offsetText.set_visible(False) + set_label = ax.set_xlabel + label = ax.get_xlabel() + + def update_label(event_axes): + offset = fmt.get_offset() + if offset == '': + set_label("{}".format(label)) + else: + set_label("{} {}".format(label, offset)) + return + + ax.callbacks.connect("ylim_changed", update_label) + ax.callbacks.connect("xlim_changed", update_label) + ax.figure.canvas.draw() + update_label(None) + return diff --git a/lama/qc/img_grid.py b/lama/qc/img_grid.py index 62efc033..01ea4f97 100644 --- a/lama/qc/img_grid.py +++ b/lama/qc/img_grid.py @@ -4,6 +4,7 @@ from pathlib import Path import shutil +import base64 class HtmlGrid: @@ -11,7 +12,7 @@ class HtmlGrid: Create a grid of images """ - def __init__(self, title): + def __init__(self, title, single_file=False): self.title = title padding = 20 
diff --git a/lama/qc/img_grid.py b/lama/qc/img_grid.py index 62efc033..01ea4f97 100644 --- a/lama/qc/img_grid.py +++ b/lama/qc/img_grid.py @@ -4,6 +4,7 @@ from pathlib import Path import shutil +import base64 class HtmlGrid: """ Create a grid of images """ - def __init__(self, title): + def __init__(self, title, single_file=False): self.title = title padding = 20 margin_bottom = 20 @@ -24,6 +25,7 @@ def __init__(self, title, single_file=False): '<body>\n' '<div class="title">{}</div>\n').format(title) self.num_rows = 0 + self.single_file = single_file def next_row(self, title: str = ''): # End the old row @@ -36,30 +38,48 @@ def next_row(self, title: str = ''): self.html += '<div class="row">\n' def next_image(self, img_path: Path, caption: str = '', tooltip: str = ''): + # data_uri = base64.b64encode(open('Graph.png', 'rb').read()).decode('utf-8') + # img_tag = '<img src="data:image/png;base64,{0}">'.format(data_uri) + # print(img_tag) + if self.single_file: + with open(img_path , 'rb') as fh: + data_uri = base64.b64encode(fh.read()).decode('ascii') + + img_tag = '<img src="data:image/png;base64,{}">'.format(data_uri) + else: + img_tag = f'<img src="{img_path}">' + self.html += ('<div class="image" title="{}">\n' - '<img src="{}">\n' + '{}\n' '<div class="top-left">{}</div>' - '</div>').format(tooltip, img_path, str(caption)) + '</div>').format(tooltip, img_tag, str(caption)) - def save(self, outpath: Path): + def save(self, outpath: Path, height=600, width=None): # close the final row self.html += '</div>\n' # close the file self.html += '</html>\n' with open(outpath, 'w') as fh: fh.write(self.html) - fh.write(self.get_css()) + fh.write(self.get_css(height, width)) + + def get_css(self, height=300, width=None): + if width: + how = f'width:{width}' + else: + how = f'height:{height}' + - def get_css(self): css = ('<style>' + 'body{{font-family: Arial}}\n' + '.title{{width: 100%; padding: 2px; background-color: lightblue; margin-bottom: 5px}}\n' + '.row{{auto;white-space: nowrap;}}\n' + '.image{{display: inline-block;text-align: center;color: white; position:relative}}\n' + # 'img{white-space: nowrap; height: 100%; width: auto; max-width: MaxSize; max-height: MaxSize;}' + 'img{{white-space: nowrap; {}}}\n' + '.clear{{clear: both}}\n' + '.row_label{{font-size: 2em}}\n' + '.top-left{{position: absolute;top: 8px;left: 16px;}}' + '</style>').format(how) return css
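How the updated `HtmlGrid` is typically driven (a sketch with placeholder paths; `single_file=True` base64-embeds each image so the resulting html is portable):

```python
from pathlib import Path
from lama.qc.img_grid import HtmlGrid

grid = HtmlGrid('axial', single_file=True)
grid.next_row(title='line_x: specimen_1')           # placeholder row title
for png in sorted(Path('qc/axial').glob('*.png')):  # placeholder folder
    grid.next_image(png, caption=png.stem, tooltip=png.name)
grid.save(Path('axial.html'), height='200px')       # css height; 'auto' also works
```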
diff --git a/lama/qc/interative_qc.py b/lama/qc/interative_qc.py new file mode 100644 index 00000000..16aa0098 --- /dev/null +++ b/lama/qc/interative_qc.py @@ -0,0 +1,214 @@ +""" +Iterate over qc images from a lama run +For each specimen + show a series of qc images + in terminal accept options such as f(fail), p(pass), c(check) + After option then add optional comment + Add row to a dataframe + Save dataframe + +Example +------- +collate_qc_new.py output/baseline/output qc_status.csv + +""" +from pathlib import Path +from skimage.io import imread +from natsort import natsorted +from itertools import chain +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from screeninfo import get_monitors +from mpl_toolkits.axes_grid1 import ImageGrid +from lama.qc.img_grid import HtmlGrid +import os + +from lama.paths import DataIterator, SpecimenDataPaths + + +def show_(fig, img_path, row, ncol, idx, cmap=None, flip=False): + fig.add_subplot(row, ncol, idx) + img = imread(img_path) + if flip: + img = np.flipud(img) + plt.imshow(img, cmap=cmap) + plt.gca().set_axis_off() + # plt.subplots_adjust(top=1, bottom=0, right=1, left=0, + # hspace=0, wspace=0) + plt.margins(0, 0) + + +def do_qc(root: Path, csv_or_dir: Path, html=False): + """ + + Parameters + ---------- + root + A LAMA registration root for a line (or the baselines) containing multiple specimens + csv + Qc csv + html + if True, write to html for prettier formatting + """ + # Create series of images specimen-based view + # mpl.rcParams['figure.raise_window'] + d = DataIterator(root) + + spec: SpecimenDataPaths + + status_map = { + 'f': 'failed', + 'p': 'passed', + 'c': 'check' + } + + if csv_or_dir.is_dir(): + interactive = False + csv_or_dir = Path(csv_or_dir) + else: + interactive = True + plt.ion() + + if csv_or_dir.is_file(): + df = pd.read_csv(csv_or_dir, index_col=0) + else: + df = pd.DataFrame(columns=['line', 'spec_id', 'status', 'comment']) + + # get first monitor. Could also look for largest monitor? + mon = get_monitors()[0] + width = mon.width + width = width - (width * 0.1) # Save some space for the console + height = mon.height + dpi = 100 + # What size does the figure need to be in inches to fit the image? + figsize = width / float(dpi), height / float(dpi) + f = plt.figure(figsize=figsize, dpi=dpi) + + grid = ImageGrid(f, 111, # similar to subplot(111) + nrows_ncols=(3, 5), # creates 3x5 grid of axes + axes_pad=0.1, # pad between axes in inch. + share_all=False, + aspect=True + ) + + for i, spec in enumerate(d): + + try: + spec.setup() + except FileNotFoundError as e: + print(f'Skipping {spec.specimen_id}\n{e}') + continue + + if interactive: + if spec.specimen_id in df.spec_id.values: + print(f'skipping {spec.specimen_id}') + continue + print(f'QC {spec.specimen_id}') + + # Get the overlays. Use natsort to sort ascending by the slice number file suffix + ol_dir = spec.qc_inverted_labels + ax_ol = natsorted([x for x in (ol_dir / 'axial').iterdir()]) + sa_ol = natsorted([x for x in (ol_dir / 'sagittal').iterdir()]) + co_ol = natsorted([x for x in (ol_dir / 'coronal').iterdir()]) + + reg_dir = spec.qc_grey_dirs + ax_ol_r = natsorted((reg_dir / 'axial').iterdir(), key= lambda x: x.name)[-1] + sa_ol_r = natsorted((reg_dir / 'sagittal').iterdir() ,key= lambda x: x.name)[-1] + co_ol_r = natsorted((reg_dir / 'coronal').iterdir(), key= lambda x: x.name)[-1] + + ax_ol.append(ax_ol_r) + sa_ol.append(sa_ol_r) + co_ol.append(co_ol_r) + + all_p = chain(co_ol, ax_ol, sa_ol) + + i = 0 + + if not html: # mpl jpg output + + for ax, im in zip(grid, all_p): + img = imread(im) + if len(img.shape) == 2: + ax.imshow(img, cmap='gray') + else: + ax.imshow(img) + i += 1 + + f.suptitle(f'{spec.line_id} - {spec.specimen_id}') + f.tight_layout() + + if interactive: + plt.show() + # plt.pause(0.1) + while True: + response = input("Enter status and optional comment: ") + status = response.split()[0] + if status not in status_map.keys(): + print(f'status must be one of {status_map.keys()}') + else: + break + + comment = ' '.join(response.split()[1:]) + + df.loc[len(df)] = [spec.line_id, spec.specimen_id, status, comment] + df.to_csv(csv_or_dir) + else: + line_out_dir = csv_or_dir / spec.line_id + line_out_dir.mkdir(exist_ok=True) + spec_out_file = line_out_dir / f'{spec.specimen_id}.jpg' + plt.savefig(spec_out_file, quality=80, optimize=True, progressive=True) + + else: # Html output + html_grid = HtmlGrid(spec.specimen_id) + + line_out_dir = csv_or_dir / spec.line_id + line_out_dir.mkdir(exist_ok=True) + spec_out_file = line_out_dir / f'{spec.specimen_id}.html' + + for g, img in enumerate(all_p): + if g % 5 == 0: + html_grid.next_row('') + img: Path + img_rel = os.path.relpath(img, line_out_dir) + html_grid.next_image(img_rel, '') + + html_grid.save(spec_out_file, width=400) + + +if __name__ =='__main__': + import argparse + import sys + parser = argparse.ArgumentParser("Generate inverted label overlay plots. Multiple slices for each orientation") + + parser.add_argument('-i', '--indir', dest='indir', + help='A lama registration output directory containing one or more line directories', + required=True) + parser.add_argument('-c', '--csv', dest='csv', + help='Use script interactively. View one specimen at a time and write qc status to this csv', + required=False, default=None) + parser.add_argument('-d', '--dir', dest='dir', + help='Use script non-interactively. Write qc images to dir in line subfolders', + required=False, default=None) + parser.add_argument('--html', dest='html', + help='Use with -d. write to html instead of using matplotlib for larger images', + required=False, default=None, action='store_true') + parser.add_argument('-s', '--single_file', dest='single_file', + help='Embed images within the html file (use the --html option). 
Makes portable html files' 'that are not tied to the LAMA folder', + required=False, default=None, action='store_true') + args = parser.parse_args() + + if args.csv is None and args.dir is None: + sys.exit("Supply either a csv path (-c) or a directory (-d)") + if args.csv is not None and args.dir is not None: + sys.exit("Supply either a csv path (-c) or a directory (-d)") + + if args.csv is not None: + csv_or_dir = Path(args.csv) + else: + csv_or_dir = Path(args.dir) + if not csv_or_dir.is_dir(): + sys.exit('-d must be a directory') + + do_qc(Path(args.indir), csv_or_dir, args.html) \ No newline at end of file diff --git a/lama/qc/organ_vol_plots.py b/lama/qc/organ_vol_plots.py index 3d781710..f697e181 100644 --- a/lama/qc/organ_vol_plots.py +++ b/lama/qc/organ_vol_plots.py @@ -20,10 +20,10 @@ import matplotlib.pyplot as plt from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas import pandas as pd +from logzero import logger as logging -from lama.common import getfile_endswith -from plotting import formatting - +from lama.common import getfile_startswith_endswith +from lama.qc import formatting organ_vol = 'organ volume' # Y label wev = 'whole embryo volume' # x label for scatter plots @@ -75,22 +75,35 @@ def hist(values): plt.close() -def make_plots(mut_lines_dir: Path, wt_organ_vols: pd.DataFrame, wt_staging: pd.DataFrame, label_meta_file: str, - stats_dir: Path, skip_no_analysis=False, organ_subset: List=[]): +def make_plots(mut_lines_dir: Path, + wt_organ_vols: pd.DataFrame, + wt_staging: pd.DataFrame, + label_meta_file: Path, + stats_root_dir: Path, + skip_no_analysis= False, + organ_subset: List = [], + extra_dir: Path = Path('')): """ Parameters ---------- mut_lines_dir + Lama registration root. eg: mutants/output with each subdir containing a line wt_organ_vols + Aggregated organ volumes for each baseline wt_staging + Aggregated staging info for each baseline label_meta_file - stats_dir + CSV of atlas metadata + stats_root_dir + Contains a folder for each line skip_no_analysis + If there is a no_analysis=True flag in the meta data csv, skip this organ organ_subset plot only the labels with these label numbers. Or can be used to order the output of the plots + extra_dir + Bit of a bodge, but if doing the non-permutation-based stats, the organ vol csv is in a directory below. + Give the name here (currently 'organ_volumes') """ label_meta = pd.read_csv(label_meta_file, index_col=0) @@ -99,18 +112,19 @@ for mut_line_dir in mut_lines_dir.iterdir(): - if not mut_line_dir.name in ['acan-g11']: - continue - if not mut_line_dir.is_dir(): continue print(mut_line_dir.name) - stats_line_dir = stats_dir / mut_line_dir.name + + stats_line_dir = stats_root_dir / mut_line_dir.name / extra_dir # extra_dir does nothing if == '' line = mut_line_dir.name - stats_result_file = getfile_endswith(stats_line_dir, '.csv') + #TODO: Get file by startswith line name and endswith extension (Could be date of analysis in middle) + # Rather than just getting any CSVs in there + + stats_result_file = getfile_startswith_endswith(stats_line_dir, line, '.csv') # Get mutant staging and organ volumes line_vols = [] @@ -136,31 +150,30 @@ def make_plots(mut_lines_dir: Path, wt_organ_vols: pd.DataFrame, wt_staging: pd.DataFrame, 
staging_df.rename(columns={'staging': wev}, inplace=True) vol_df = pd.concat([wt_organ_vols, df_vol_mut]) - try: - hits: pd.DataFrame = df_hits[df_hits['significant_cal_p'] == True] - except Exception: - pass - if skip_no_analysis: - # Bodge until I fix getting of csvs by name - if 'no_analysis' not in hits: - hits = hits.merge(label_meta[['organ_system_name', 'no_analysis']], how='inner', left_index=True, right_index=True) - else: - hits = hits.merge(label_meta[['organ_system_name']], how='inner', left_index=True, right_index=True) - hits = hits[hits['no_analysis'] != True] + if 'significant_cal_p' in df_hits: # permutation stats + hits: pd.DataFrame = df_hits[df_hits['significant_cal_p'] == True] + elif 'significant_bh_q_5' in df_hits: + hits: pd.DataFrame = df_hits[df_hits['significant_bh_q_5'] == True] else: - hits = hits.merge(label_meta[['organ_system_name']], how='inner', left_index=True, right_index=True) + logging.error("Plots not made: Stats output file must have 'significant_cal_p' or 'significant_bh_q_5' column") + + if ('organ_system_name' in label_meta.columns) and ('organ_system_name' not in hits): + # Sort by organ system if present in atlas metadata + hits = hits.merge(label_meta[['organ_system_name']], how='left', left_index=True, right_index=True) + hits.sort_values(by='organ_system_name', inplace=True) + + if skip_no_analysis: + # Skip organs that are flagged with no_analysis in the atlas metadata file + if 'no_analysis' in hits: + hits = hits[hits['no_analysis'] != True] if len(hits) < 1: + logging.info(f'No hits, so skipping organ vol plots for: {mut_line_dir.name}') continue - try: - hits.sort_values(by='organ_system_name', inplace=True) - except Exception: - pass - st = wt_staging['staging'] normed_wt = wt_organ_vols.div(st, axis=0) @@ -178,8 +191,6 @@ fig_scat = Figure(figsize=(figsize_x, figsize_y)) FigureCanvas(fig_scat) - boxprops = dict(linewidth=1, facecolor=None) - if organ_subset: labels_to_plot = organ_subset @@ -196,8 +207,6 @@ axes.tick_params(labelsize=18) axes.set_yticklabels([]) - - label = str(label) wt = normed_wt[[label]] @@ -212,16 +221,14 @@ min_ = df[organ_vol].min() - (df[organ_vol].min() * 0.1) max_ = df[organ_vol].max() + (df[organ_vol].max() * 0.1) - voxel_size = 14.0 - df[organ_vol] = df[organ_vol] * voxel_size - df[wev] = df[organ_vol] * voxel_size - sns.boxplot(x="genotype", y=organ_vol, data=df, orient='v', - ax=axes, boxprops=boxprops) + + # sns.boxplot(x="genotype", y="organ volume", data=df, orient='v', + # ax=axes, boxprops=boxprops) axes.tick_params(labelsize=18) - ax = sns.swarmplot(x="genotype", y=organ_vol, data=df, orient='v', + ax = sns.swarmplot(x="genotype", y='organ volume', data=df, orient='v', ax=axes) ax.set_ylim(min_, max_) @@ -230,6 +237,8 @@ r, g, b, a = patch.get_facecolor() patch.set_facecolor((r, g, b, 0.0)) + if 'short_name' in label_meta: + label_name = label_meta.at[int(label), 'short_name'] title = label_name.replace('_', ' ') title = title.capitalize() ax.set_ylabel('') @@ -241,15 +250,22 @@ def make_plots(mut_lines_dir: Path, wt_organ_vols: pd.DataFrame, wt_staging: pd.DataFrame, 
s_axes = fig_scat.add_subplot(numrows, numcol, i + 1) s_axes.tick_params(labelsize=18) + # Get rid of hard-coding + voxel_size = 27.0 + um3_conv_factor = voxel_size ** 3 # To convert voxels to um3 + um3_to_mm3_conv_factor = 1e9 + scattter_df = staging_df.join(vol_df[[label]]).rename(columns={label: organ_vol}) + scattter_df[organ_vol] = (scattter_df[organ_vol] * um3_conv_factor) / um3_to_mm3_conv_factor + scattter_df[wev] = (scattter_df[wev] * um3_conv_factor) / um3_to_mm3_conv_factor scattter_df['normalised_organ_vol'] = scattter_df[organ_vol] / scattter_df[wev] sax = sns.scatterplot(y=organ_vol, x=wev, ax=s_axes, hue='genotype', data=scattter_df) - sax.set(xlabel='Whole embryo volume ($\u03BC$m^3)') - sax.set(ylabel='Organ volume (($\u03BC$m^3)') + sax.set(xlabel='Whole embryo volume (mm^3)') + sax.set(ylabel='Organ volume (mm^3)') sax.set_title(title, fontsize=16) sax.ticklabel_format(style='sci',scilimits=(0, 0)) @@ -262,14 +278,15 @@ fig.subplots_adjust(top=0.8) # TODO fix this for larger plot fig.suptitle(line, fontsize=30, y=0.98) - fig.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) + # fig.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) if skip_no_analysis: box_name = f'{line}_boxplots_no_analysis.png' else: box_name = f'{line}_boxplots.png' - fig.savefig(stats_line_dir / box_name) + # TODO: Fix the boxplot or swarm plot output + # fig.savefig(stats_line_dir / box_name) fig_scat.subplots_adjust(top=0.8, wspace=0.35, hspace=0.4) # TODO fix this for larger plot fig_scat.suptitle(line, fontsize=30, y=0.98)
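A quick check of the unit conversion hard-coded above: at the assumed 27 µm isotropic voxel size, one voxel is 27³ = 19 683 µm³ and 1 mm³ = 10⁹ µm³, so:

```python
voxel_size_um = 27.0                 # hard-coded assumption flagged by the TODO above
um3_per_voxel = voxel_size_um ** 3   # 19683.0 um^3 per voxel
UM3_PER_MM3 = 1e9

def voxels_to_mm3(n_voxels: float) -> float:
    return n_voxels * um3_per_voxel / UM3_PER_MM3

print(voxels_to_mm3(1_000_000))      # 1e6 voxels -> 19.683 mm^3
```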
diff --git a/lama/qc/qc_images.py b/lama/qc/qc_images.py index 7ab7db67..5ad5ce29 100644 --- a/lama/qc/qc_images.py +++ b/lama/qc/qc_images.py @@ -11,16 +11,21 @@ import numpy as np from skimage.exposure import rescale_intensity from skimage.transform import match_histograms +from skimage import exposure from skimage.io import imsave +from skimage.measure import regionprops from lama import common -from lama.elastix import IGNORE_FOLDER +from lama.elastix import RESOLUTION_IMGS_DIR from lama.paths import SpecimenDataPaths -MOVING_RANGE = (0, 180) # Rescale the moving image to these values for the cyan/red overlay +INTENSITY_RANGE = (0, 255) # Rescale the moving image to these values for the cyan/red overlay -def make_qc_images(lama_specimen_dir: Path, target: Path, outdir: Path): +def make_qc_images(lama_specimen_dir: Path, + target: Path, + outdir: Path, + mask: Path): """ Generate mid-slice images for quick qc of registration process. @@ -32,6 +37,8 @@ The target image to display in cyan outdir Where to put the qc images + mask + Used to identify the embryo in the image so we can display useful info Notes ----- @@ -57,35 +64,47 @@ for i, (stage, img_path) in enumerate(paths.registration_imgs()): img = common.LoadImage(img_path).array - make_red_cyan_qc_images(target, img, red_cyan_dir, greyscale_dir, img_path.stem, i, stage) + _make_red_cyan_qc_images(target, img, red_cyan_dir, greyscale_dir, img_path.stem, i, stage) if paths.inverted_labels_dirs: - # TODO: First reg img will be either the rigid-registered image if tehre are no resolution intermediate images, + # TODO: First reg img will be either the rigid-registered image if there are no resolution intermediate images, # which is relly what we want want. Other wise it will be the first resolotio image, which will do for now, # as they are usually very similar - first_reg_dir = paths.reg_dirs[0] + try: + first_reg_dir = paths.reg_dirs[0] + except IndexError: # Todo if only one stage of reg, overlay on inputs + logging.info('skipping inverted label overlay') + return # if we had rigid, affine , deformable stages. We would need to overlay rigid image ([0]) with the label that # had finally had the inverted affine transform applied to it ([1) inverted_label_dir = paths.inverted_labels_dirs[1] inverted_label_overlays_dir = outdir / 'inverted_label_overlay' inverted_label_overlays_dir.mkdir(exist_ok=True) - overlay_labels(first_reg_dir, - inverted_label_dir, - inverted_label_overlays_dir) + _overlay_labels(first_reg_dir, + inverted_label_dir, + inverted_label_overlays_dir, + mask=mask) -def overlay_labels(first_stage_reg_dir: Path, - inverted_labeldir: Path, - out_dir_labels: Path): +def _overlay_labels(first_stage_reg_dir: Path, + inverted_labeldir: Path, + out_dir_labels: Path, + mask: Path=None): """ Overlay the first registrated image (rigid) with the corresponding inverted labels It depends on the registered volumes and inverted label maps being named identically TODO: Add axial and coronal views. """ + if mask: + mask = sitk.GetArrayFromImage(sitk.ReadImage(str(mask))) + rp = regionprops(mask) + # Get the largest label. Likely only one from the mask + mask_props = list(reversed(sorted(rp, key=lambda x: x.area)))[0] + bbox = mask_props['bbox'] - for vol_path in common.get_file_paths(first_stage_reg_dir, ignore_folder=IGNORE_FOLDER): + for vol_path in common.get_file_paths(first_stage_reg_dir, ignore_folders=RESOLUTION_IMGS_DIR): vol_reader = common.LoadImage(vol_path) @@ -104,27 +123,71 @@ def _overlay_labels(first_stage_reg_dir: Path, cast_img = sitk.Cast(sitk.RescaleIntensity(vol_reader.img), sitk.sitkUInt8) arr = sitk.GetArrayFromImage(cast_img) - slice_ = np.flipud(arr[:, :, arr.shape[2] // 2]) + base = splitext(basename(label_reader.img_path))[0] l_arr = label_reader.array - l_slice_ = np.flipud(l_arr[:, :, l_arr.shape[2] // 2]) - base = splitext(basename(label_reader.img_path))[0] - out_path = join(out_dir_labels, base + '.png') - blend_8bit(slice_, l_slice_, out_path) + def sag(idx_): + slice_sag = np.flipud(arr[:, :, idx_]) + l_slice_sag = np.flipud(l_arr[:, :, idx_]) + sag_dir = out_dir_labels / 'sagittal' + sag_dir.mkdir(exist_ok=True) + out_path_sag = sag_dir / f'{base}_{idx_}.png' + _blend_8bit(slice_sag, l_slice_sag, out_path_sag) + if mask is None: # get a few slices from middle + sag_indxs = np.linspace(0, arr.shape[2], 8, dtype=np.int)[2:-2] + else: + sag_start = bbox[2] + sag_end = bbox[5] + sag_indxs = np.linspace(sag_start, sag_end, 6, dtype=np.int)[1:-1] # Take the 4 inner slices + for idx in sag_indxs: + sag(idx) + + def ax(idx_): + slice_ax = arr[idx_, :, :] + l_slice_ax = l_arr[idx_, :, :] + ax_dir = out_dir_labels / 'axial' + ax_dir.mkdir(exist_ok=True) + out_path_ax = ax_dir / f'{base}_{idx_}.png' + _blend_8bit(slice_ax, l_slice_ax, out_path_ax) + if mask is None: # get a few slices from middle + ax_indxs = np.linspace(0, arr.shape[0], 8, dtype=np.int)[2:-2] + else: + ax_start = bbox[0] + ax_end = bbox[3] + ax_indxs = np.linspace(ax_start, ax_end, 6, dtype=np.int)[1:-1] + for idx in ax_indxs: + ax(idx) + + def cor(idx_): + slice_cor = np.flipud(arr[:, idx_, :]) + l_slice_cor = np.flipud(l_arr[:, idx_, :]) + cor_dir = out_dir_labels / 'coronal' + cor_dir.mkdir(exist_ok=True) + out_path_cor = cor_dir / f'{base}_{idx_}.png'
 + _blend_8bit(slice_cor, l_slice_cor, out_path_cor) + if mask is None: # get a few slices from middle + cor_indxs = np.linspace(0, arr.shape[1], 8, dtype=np.int)[2:-2] + else: + cor_start = bbox[1] + cor_end = bbox[4] + cor_indxs = np.linspace(cor_start, cor_end, 6, dtype=np.int)[1:-1] + for idx in cor_indxs: + cor(idx) + else: logging.info('No inverted label found. Skipping creation of inverted label-image overlay') -def blend_8bit(gray_img: np.ndarray, label_img: np.ndarray, out: Path, alpha: float=0.18): +def _blend_8bit(gray_img: np.ndarray, label_img: np.ndarray, out: Path, alpha: float=0.18): overlay_im = sitk.LabelOverlay(sitk.GetImageFromArray(gray_img), sitk.GetImageFromArray(label_img), alpha, 0) - sitk.WriteImage(overlay_im, out) + sitk.WriteImage(overlay_im, str(out)) -def red_cyan_overlay(slice_1, slice_2) -> np.ndarray: +def _red_cyan_overlay(slice_1, slice_2) -> np.ndarray: rgb = np.zeros([*slice_1.shape, 3], np.uint8) rgb[..., 0] = slice_1 @@ -133,13 +196,13 @@ return rgb -def make_red_cyan_qc_images(target: np.ndarray, - specimen: np.ndarray, - out_dir: Path, - grey_cale_dir: Path, - name: str, - img_num: int, - stage_id: str) -> List[np.ndarray]: +def _make_red_cyan_qc_images(target: np.ndarray, + specimen: np.ndarray, + out_dir: Path, + grey_cale_dir: Path, + name: str, + img_num: int, + stage_id: str) -> List[np.ndarray]: """ Create a cyan red overlay Parameters @@ -167,8 +230,9 @@ def get_ori_dirs(root: Path): if target.shape != specimen.shape: raise ValueError('target and specimen must be same shape') - specimen = np.clip(specimen, 0, 255) - specimen = rescale_intensity(specimen, out_range=MOVING_RANGE) + # specimen = np.clip(specimen, 0, 255) + specimen = rescale_intensity(specimen, out_range=INTENSITY_RANGE).astype(np.uint8) + target = rescale_intensity(target, out_range=INTENSITY_RANGE).astype(np.uint8) def get_slices(img): slices = [] @@ -179,8 +243,18 @@ def get_slices(img): t = get_slices(target) s = get_slices(specimen) - # histogram match the specimen to the target - s = [match_histograms(s_, reference=t_) for (s_, t_) in zip(s, t)] + # histogram match the specimen to the target for each orientation slice + # This produces bad results sometimes. Move to adaptive histogram equalization + + # Todo: histogram matching switched off as it makes the specimen disappear. Fix this
 + # s = [exposure.equalize_adapthist(x, clip_limit=0.03) for x in s] + # Try mean normalisation + # for ti, si in zip(t, s): + # med_t = ti[ti > 5].mean() + # med_s = si[si > 5].mean() + # diff = med_t - med_s + # si += int(diff) + s = [match_histograms(si, ti) for si, ti in zip(s, t)] rc_oris = get_ori_dirs(out_dir) grey_oris = get_ori_dirs(grey_cale_dir) @@ -188,9 +262,10 @@ # put slices in folders by orientation for i in range(len(oris)): grey = s[i] - rgb = red_cyan_overlay(s[i], t[i]) - rgb = np.flipud(rgb) - grey = np.flipud(grey) + rgb = _red_cyan_overlay(s[i], t[i]) + if not oris[i] == 'axial': + rgb = np.flipud(rgb) + grey = np.flipud(grey) imsave(rc_oris[i][0] / f'{img_num}_{stage_id}_{name}_{rc_oris[i][1]}.png', rgb) imsave(grey_oris[i][0] / f'{img_num}_{stage_id}_{name}_{rc_oris[i][1]}.png', grey) diff --git a/lama/registration_pipeline/optimise/run_lama_configs_grid.py b/lama/registration_pipeline/optimise/run_lama_configs_grid.py index 4718d132..e90ec096 100644 --- a/lama/registration_pipeline/optimise/run_lama_configs_grid.py +++ b/lama/registration_pipeline/optimise/run_lama_configs_grid.py @@ -1,6 +1,6 @@ """ -Given a input folde of lines (could be centres etc) and a folder of lama configs: -* Run each specimen on a grid node with each config filee +Given an input folder of lines (could be centres etc) and a folder of lama configs: +* Run each specimen on a grid node with each config file (same config as used for the grid job runner) * Example toml config @@ -60,7 +60,14 @@ def run(config_path): root_reg_dir = out_root / lama_config_path.stem root_reg_dir.mkdir(exist_ok=True) - for line_dir in inputs_dir.iterdir(): + # Check if multiple dirs exist in inputs (which might be different lines/centres, for example) + # If only files, we just run them all together + if any([x.is_dir() for x in inputs_dir.iterdir()]): + input_dirs = [x for x in inputs_dir.iterdir() if x.is_dir()] + else: + input_dirs = [inputs_dir] + + for line_dir in input_dirs: for input_ in line_dir.iterdir(): print(f'Specimen: {input_.name}, config: {lama_config_path.name}') @@ -80,7 +87,7 @@ def run(config_path): def run_on_grid(lama_config_path, grid_config): - + # lama_config_path = str(lama_config_path).replace('/mnt', '') c = grid_config cmd = f'{c["grid_cmd"]} "{c["docker_cmd"]} \'{c["lama_cmd"]}\'"' # Now interpolate our values @@ -90,6 +97,7 @@ def run_on_grid(lama_config_path, grid_config): conn.run(cmd, env={'SGE_ROOT': '/grid/dist/GE2011.11p1'}) conn.close() + if __name__ == '__main__': import sys cfg_path = sys.argv[1] diff --git a/lama/registration_pipeline/parallel_average.py b/lama/registration_pipeline/parallel_average.py index 7b8c889e..6fc2ec9f 100644 --- a/lama/registration_pipeline/parallel_average.py +++ b/lama/registration_pipeline/parallel_average.py @@ -54,7 +54,7 @@ def check_stage_done(root_dir) -> bool: return True -def grid_runner(config_path: Path) -> Path: +def job_runner(config_path: Path) -> Path: """ Run the registrations specified in the config file @@ -140,7 +140,6 @@ def job_runner(config_path: Path) -> Path: print('breaking stage') break - # Get the input for this specimen if i == 0: # The first stage moving = inputs_dir / f'{next_spec_id}.nrrd' @@ -188,4 +187,4 @@ import sys config_path_ = Path(sys.argv[1]) -grid_runner(config_path_) \ No newline at end of file +job_runner(config_path_) \ No newline at end of file
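The new pairwise module below registers every ordered pair of specimens; `itertools.permutations` with `r=2` is what generates those pairs in its `get_pairs` helper. A quick demonstration of that pairing (stand-in IDs):

```python
from itertools import permutations

specimens = ['spec1', 'spec2', 'spec3']  # stand-in specimen IDs
pairs = {f'{a}_{b}': (a, b) for a, b in permutations(specimens, r=2)}
print(sorted(pairs))
# ['spec1_spec2', 'spec1_spec3', 'spec2_spec1', 'spec2_spec3', 'spec3_spec1', 'spec3_spec2']
# i.e. n * (n - 1) registrations for n specimens
```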
b/lama/registration_pipeline/parallel_average_pairwise.py new file mode 100644 index 00000000..3646c5b4 --- /dev/null +++ b/lama/registration_pipeline/parallel_average_pairwise.py @@ -0,0 +1,311 @@ +""" +Create a population average from pairwise registrations and distribute the jobs. + +""" + +from pathlib import Path +import traceback +from typing import List +from itertools import permutations +import time +from lama.registration_pipeline.validate_config import LamaConfig +from lama.registration_pipeline.run_lama import generate_elx_parameters, ELX_PARAM_PREFIX +from lama import common +from lama.elastix.elastix_registration import TargetBasedRegistration, PairwiseBasedRegistration +from logzero import logger as logging +import logzero +import SimpleITK as sitk + + + +def mean_transform(pair_reg_dir: Path, + pre_avg_dir: Path, + current_avg_dir): + + """ + Apply the mean transform and create a population average for each stage (the latter for visualisation purposes only) + + Parameters + ---------- + pair_reg_dir + A stage root directory. Subdirectories for each fixed image + + Returns + ------- + + """ + + # This loop gets all the averages + + + fixed_vol = list(pre_avg_dir.glob(f'**/{pair_reg_dir.name}.nrrd'))[0] + tp_paths = list(pair_reg_dir.glob('**/TransformParameters.0.txt')) + + out_dir = current_avg_dir / pair_reg_dir.name + out_dir.mkdir(exist_ok=True) + + tp_out_name = f'{pair_reg_dir.name}.txt' + + PairwiseBasedRegistration.generate_mean_tranform(tp_paths, fixed_vol, out_dir, tp_out_name, filetype='nrrd') + + +def get_pairs(inputs_dir): + """ + Given the input directory, return a dict mapping each pairwise combination name (f'{id1}_{id2}', extensions stripped) to its (id1, id2) tuple + + """ + specimen_ids = [Path(x).stem for x in common.get_file_paths(inputs_dir)] + perms = list(permutations(specimen_ids, r=2)) + d = {} + for p in perms: + k = f'{p[0]}_{p[1]}' + d[k] = p + # k = {(f'{x[0]}_{x[1]}'): x for x in perms} + return d + + +def get_next_pair(stage_dir: Path): + """ + Return the names of all pair registrations that have been started in this stage + Parameters + ---------- + stage_dir + + Returns + ------- + + """ + + started_dirs = [] + for fixed_root in stage_dir.iterdir(): + if not fixed_root.is_dir(): + continue + for pair_dir in fixed_root.iterdir(): + started_dirs.append(pair_dir.name) + + return started_dirs + + +def do_reg(moving, fixed, stage_dir, pair_name, config, elxparam_path): + + # For each volume keep all the registrations where it is fixed in this folder + fixed_out_root = stage_dir / fixed.stem + + pair_dir = fixed_out_root / pair_name + pair_dir.mkdir(exist_ok=True, parents=True) + + fixed_mask = None + + # Do the registrations + registrator = TargetBasedRegistration(elxparam_path, + fixed, + pair_dir, + config['filetype'], + config['threads'], + fixed_mask + ) + + registrator.set_target(moving) + registrator.rename_output = False + + registrator.run() + + +def do_stage_reg(pairs: List[str], + stage_status_dir: Path, + reg_stage_dir: Path, + previous_mean_dir, + elastix_param_path, + config, + first=True): + """ + Process a stage. Only return when fully completed + + Returns + ------- + + """ + finished_stage = False + + while True: # Pick up unstarted pairwise registrations. Only break when reg and average complete
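+ # Coordination between concurrently running instances relies on atomic exclusive file creation: + # open(path, 'x') succeeds in exactly one process, so whichever instance first creates + # status/<stage>/started/<pair> owns that pair's registration. Illustrative sketch ('id1_id2' is a made-up pair name): + # with open(start_dir / 'id1_id2', 'x'): + # do_reg(...) # only the winning job reaches here; the others hit FileExistsError and move on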
+ + # Check if any specimens left (It's possible the avg is being made but all specimens are registered) + start_dir = stage_status_dir / 'started' + finish_dir = stage_status_dir / 'finished' + failed_dir = stage_status_dir / 'failed' + + + started = list([x.name for x in start_dir.iterdir()]) + pair_keys = list(pairs.keys()) + not_started = set(pair_keys).difference(started) + + if len(not_started) > 0: + next_pair_name = list(not_started)[0] # Some specimens left. Pick up spec_id and process + next_pair = pairs[next_pair_name] + # drop a file in the started dir + start_file = start_dir / next_pair_name + + try: + with open(start_file, 'x') as fh: + if first: # The first stage all inputs in same dir + moving = previous_mean_dir / f'{next_pair[0]}.nrrd' + fixed = previous_mean_dir / f'{next_pair[1]}.nrrd' + else: # Outputs are in individual folders + moving = previous_mean_dir / next_pair[0] / f'{next_pair[0]}.nrrd' + fixed = previous_mean_dir / next_pair[1] / f'{next_pair[1]}.nrrd' + + try: + do_reg(moving, fixed, reg_stage_dir, next_pair_name, config, elastix_param_path) + reg_finished_file = finish_dir / next_pair_name + open(reg_finished_file, 'x').close() + except Exception as e: + failed_reg_file = failed_dir / next_pair_name + + with open(failed_reg_file, 'x') as fh: + traceback.print_exc(file=fh) + raise + + except FileExistsError: + # This is raised if another instance has already touched the file. + continue + + else: # All specimens have at least been started + + # This block waits for all registrations to finish, or raises if any failures are detected + while True: + finished = [x.name for x in finish_dir.iterdir()] + failed = [x.name for x in failed_dir.iterdir()] + + if failed: + raise RuntimeError('Exiting as a failure has been detected') + + not_finished = set(pairs.keys()).difference(finished) + + if not_finished: + print("Waiting for specimens to finish reg") + time.sleep(5) + else: + finished_stage = True + break + if finished_stage: + break + + +def do_mean_transforms(pairs, stage_status_dir, reg_stage_dir, mean_dir, previous_mean_dir, avg_out): + + mean_started_dir = stage_status_dir / 'mean_started' + mean_finished_dir = stage_status_dir / 'mean_finished' + + moving_ids = [] + for moving_vol_dir in reg_stage_dir.iterdir(): + if not moving_vol_dir.is_dir(): + continue + moving_ids.append(moving_vol_dir.name) + + try: + with open(mean_started_dir / moving_vol_dir.name, 'x'): + mean_transform(moving_vol_dir, previous_mean_dir, mean_dir) + mean_finished_file = mean_finished_dir / moving_vol_dir.name + open(mean_finished_file, 'x').close() + except FileExistsError: + continue + + while True: + # Wait for mean transforms to finish + means_finished = [x.name for x in mean_finished_dir.iterdir()] + means_not_finished = set(moving_ids).difference(means_finished) + + if len(means_not_finished) > 0: + print('waiting for mean transforms') + time.sleep(2) + else: + break + # make average images + avg_started_file = str(avg_out) + 'started' + try: + with open(avg_started_file, 'x'): + img_paths = common.get_file_paths(mean_dir) + avg = common.average(img_paths) + sitk.WriteImage(avg, str(avg_out)) + except FileExistsError: + return # We don't have to wait for the average to finish as it's not required for the next stage + + +def job_runner(config_path: Path) -> Path: + """ + Run the registrations specified in the config file + + Returns + ------- + The path to the final registered images + """ + + # Load the lama config + config = LamaConfig(config_path)
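+ # Working layout created below, next to the config: mean_transforms/<stage_id>/ holds the + # mean-transformed volume per specimen, averages/<stage_id>.nrrd the per-stage population + # average (QC only), and status/<stage_id>/{started,finished,failed,mean_started,mean_finished} + # the coordination files used by do_stage_reg() and do_mean_transforms().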
+ + mean_transforms = config_path.parent / 'mean_transforms' + mean_transforms.mkdir(exist_ok=True) + + avg_dir = config_path.parent / 'averages' + avg_dir.mkdir(exist_ok=True) + + # Folder for the control files used to coordinate multiple instances + status_dir = config_path.parent / 'status' + status_dir.mkdir(exist_ok=True) + + # Get list of specimens + inputs_dir = config.options['inputs'] + pairs = get_pairs(inputs_dir) + + previous_mean_dir = inputs_dir + first = True + + for i, reg_stage in enumerate(config['registration_stage_params']): + + stage_id = reg_stage['stage_id'] + avg_out = avg_dir / f'{stage_id}.nrrd' + logging.info(stage_id) + reg_stage_dir = Path(config.stage_dirs[stage_id]) + + # Make stage dir if not made by another instance of the script + reg_stage_dir.mkdir(exist_ok=True, parents=True) + + elastix_stage_parameters = generate_elx_parameters(config, do_pairwise=True)[stage_id] + # Make the elastix parameter file for this stage + elxparam_path = reg_stage_dir / f'{ELX_PARAM_PREFIX}{stage_id}.txt' + + if not elxparam_path.is_file(): + with open(elxparam_path, 'w') as fh: + fh.write(elastix_stage_parameters) + + stage_mean_dir = mean_transforms / stage_id + stage_mean_dir.mkdir(exist_ok=True, parents=True) + + stage_status_dir = status_dir / stage_id + stage_status_started = stage_status_dir / 'started' + stage_status_failed = stage_status_dir / 'failed' + stage_status_finished = stage_status_dir / 'finished' + + stage_status_started.mkdir(exist_ok=True, parents=True) + stage_status_failed.mkdir(exist_ok=True, parents=True) + stage_status_finished.mkdir(exist_ok=True, parents=True) + + stage_tform_started = stage_status_dir / 'mean_started' + stage_tform_finished = stage_status_dir / 'mean_finished' + stage_tform_started.mkdir(exist_ok=True) + stage_tform_finished.mkdir(exist_ok=True) + + do_stage_reg(pairs, stage_status_dir, reg_stage_dir, previous_mean_dir, + elxparam_path, config, first) + + do_mean_transforms(pairs, stage_status_dir, reg_stage_dir, stage_mean_dir, previous_mean_dir, avg_out) + + first = False + previous_mean_dir = stage_mean_dir + + +if __name__ == '__main__': + import sys + config_path_ = Path(sys.argv[1]) + + job_runner(config_path_) diff --git a/lama/registration_pipeline/run_lama.py b/lama/registration_pipeline/run_lama.py index 821c37c2..584ae600 100755 --- a/lama/registration_pipeline/run_lama.py +++ b/lama/registration_pipeline/run_lama.py @@ -121,6 +121,7 @@ from lama.elastix.elastix_registration import TargetBasedRegistration, PairwiseBasedRegistration from lama.staging import staging_metric_maker from lama.qc.qc_images import make_qc_images +from lama.qc.folding import folding_report from lama.stats.standard_stats.data_loaders import DEFAULT_FWHM, DEFAULT_VOXEL_SIZE from lama.elastix import INVERT_CONFIG, REG_DIR_ORDER from lama.monitor_memory import MonitorMemory @@ -130,6 +131,8 @@ ELX_PARAM_PREFIX = 'elastix_params_' # Prefix the generated elastix parameter files PAD_INFO_FILE = 'pad_info.yaml' +temp_debug_fix_folding = True + # Set the spacing and origins before registration SPACING = (1.0, 1.0, 1.0) @@ -189,7 +192,8 @@ def run(configfile: Path): final_registration_dir = run_registration_schedule(config) - make_deformations_at_different_scales(config) + neg_jac = make_deformations_at_different_scales(config) + folding_report(neg_jac, config['output_dir'], config['label_info']) create_glcms(config, final_registration_dir) @@ -215,19 +219,14 @@ def run(configfile: Path): fh.write(f'{reg_stage["stage_id"]}\n') if not no_qc: - if config['skip_transform_inversion']: -
inverted_label_overlay_dir = None - else: - inverted_label_overlay_dir = config.mkdir('inverted_label_overlay_dir') - - # registered_midslice_dir = config.mkdir('registered_midslice_dir') - make_qc_images(config.config_dir, config['fixed_volume'], qc_dir) + mask = config['inverted_stats_masks'] / 'rigid' + if not mask.is_file(): + mask = None + make_qc_images(config.config_dir, config['fixed_volume'], qc_dir, mask=mask) mem_monitor.stop() - - return True @@ -352,6 +351,14 @@ def run_registration_schedule(config: LamaConfig) -> Path: ------- The path to the final registered images """ + st = config['stage_targets'] + if st: + with open(st, 'r') as stfh: + stage_targets = yaml.load(stfh)['targets'] + if len(config['registration_stage_params']) != len(stage_targets): + logging.error(f'Len stage targets: {len(stage_targets)}') + logging.error(f'Len reg stages: {len(config["registration_stage_params"])}') + raise LamaConfigError("stage_targets length != number of registration stages") # Create a folder to store mid section coronal images to keep an eye on registration process if not config['no_qc']: @@ -360,6 +367,9 @@ def run_registration_schedule(config: LamaConfig) -> Path: elastix_stage_parameters = generate_elx_parameters(config, do_pairwise=config['pairwise_registration']) regenerate_target = config['generate_new_target_each_stage'] + if regenerate_target and st: + raise LamaConfigError('cannot have regenerate_target and stage_targets') + if regenerate_target: logging.info('Creating new target each stage for population average creation') else: @@ -369,7 +379,10 @@ def run_registration_schedule(config: LamaConfig) -> Path: moving_vols_dir = config['inputs'] # Set the fixed volume up for the first stage. This will change each stage if doing population average - fixed_vol = config['fixed_volume'] + if st: + fixed_vol = stage_targets[0] + else: + fixed_vol = config['fixed_volume'] for i, reg_stage in enumerate(config['registration_stage_params']): @@ -427,8 +440,13 @@ def run_registration_schedule(config: LamaConfig) -> Path: if (not config['pairwise_registration']) or (config['pairwise_registration'] and euler_stage): registrator.set_target(fixed_vol) + if reg_stage['elastix_parameters']['Transform'] == 'BSplineTransform': + logging.info(f'Folding correction for stage {stage_id} set') + registrator.fix_folding = config['fix_folding'] # Currently only works for TargetBasedRegistration + registrator.run() # Do the registrations for a single stage + # Make average from the stage outputs average_path = join(config['average_folder'], '{0}.{1}'.format(stage_id, config['filetype'])) registrator.make_average(average_path) @@ -443,6 +461,8 @@ def run_registration_schedule(config: LamaConfig) -> Path: if i + 1 < len(config['registration_stage_params']): if regenerate_target: fixed_vol = average_path # The average from the previous step + elif st: + fixed_vol = stage_targets[i+1] moving_vols_dir = stage_dir # Set the output of the current stage to be the input of the next @@ -454,7 +474,7 @@ def run_registration_schedule(config: LamaConfig) -> Path: def create_glcms(config: LamaConfig, final_reg_dir): """ Create grey level co-occurence matrices. This is done in the main registration pipeline as we don't - want to have to create GLCMs for the wildtypes multiple times when doing phenotype detection + want to have to create GLCMs for the wildtypes multiple times when doing phenotype detection.
""" if not config['glcm']: return diff --git a/lama/registration_pipeline/validate_config.py b/lama/registration_pipeline/validate_config.py index 543fd9b8..9a8b535a 100755 --- a/lama/registration_pipeline/validate_config.py +++ b/lama/registration_pipeline/validate_config.py @@ -5,6 +5,7 @@ import difflib from pathlib import Path from collections import OrderedDict +from typing import Union, Dict from logzero import logger as logging import numpy as np @@ -24,24 +25,42 @@ class LamaConfigError(BaseException): class LamaConfig: """ - Contains information derived form the user config such as paths - The keys from output_paths, target_names and input_options will be avaialble via the __getitem__ + Contains information derived form the user config such as paths and options. + The keys from output_paths, target_names and input_options will be avaialble via the __getitem__ function + + Example + ------- + lc = LamaConfig(Path('cfg_path')) + """ - def __init__(self, config_path: Path): + def __init__(self, config: Union[Path, Dict], cfg_path: Path=None): """ Parameters ---------- - config_path - pat to the lama config file + config + path to the lama config file + or + config dictionary + cfg_path + Used for testing. If we want to pass in a dict rather than a path we with also need a path of the project + directory, which is normally the cfg parent directory Raises ------ - OSError of subclasses thereof if config file cannot be opened + OSError or subclasses thereof if config file cannot be opened """ - - self.config_path = config_path - self.config = common.cfg_load(config_path) + if isinstance(config, dict): + if cfg_path is None: + raise ValueError("Please supply a project root path") + self.config = config + config_path = cfg_path + elif isinstance(config, Path): + self.config = common.cfg_load(config) + config_path = config + else: + raise ValueError("config must me a Path or Dict") + self.config_path = Path(config_path) # The variable names mapped to the actual names of output directories # If the value is a string, it will be created in the output_dir @@ -68,6 +87,7 @@ def __init__(self, config_path: Path): 'inverted_labels': 'inverted_labels', 'inverted_stats_masks': 'inverted_stats_masks', 'organ_vol_result_csv': common.ORGAN_VOLUME_CSV_FILE + }) # Options in the config that map to files that can be present in the target folder @@ -76,13 +96,13 @@ def __init__(self, config_path: Path): 'stats_mask', 'fixed_volume', 'label_map', - 'label_names' + 'label_info' ) self.input_options = { - - # parameter: [options...], default] - # Options can be types or functions + # Config parameters to be validated (non-elastix related parameters) + # parameter: ([options...], default) + # Options can be types to check against or functions that retrn True is value is valid 'global_elastix_params': ('dict', 'required'), 'registration_stage_params': ('dict', 'required'), 'no_qc': ('bool', False), @@ -97,7 +117,9 @@ def __init__(self, config_path: Path): 'staging': ('func', self.validate_staging), 'data_type': (['uint8', 'int8', 'int16', 'uint16', 'float32'], 'uint8'), 'glcm': ('bool', False), - 'config_version': ('float', 1.1) + 'config_version': ('float', 1.1), + 'stage_targets': (Path, False), + 'fix_folding': (bool, False) } # The paths to each stage output dir: stage_id: Path @@ -122,7 +144,7 @@ def __init__(self, config_path: Path): self.check_options() - # self.check_images() + self.check_images() self.resolve_output_paths() @@ -194,6 +216,7 @@ def check_options(self): validation[1]() continue # 
+ # Check for a list of options elif isinstance(checker, list): if value not in checker: @@ -229,8 +252,18 @@ def validate_staging(self): else: if st not in list(STAGING_METHODS.keys()): raise LamaConfigError('staging must be one of {}'.format(','.join(list(STAGING_METHODS.keys())))) - if st == 'embryo_volume' and not self.config.get('stats_mask'): - raise LamaConfigError("To calculate embryo volume a 'stats_mask' should be added to the config ") + + if st == 'embryo_volume': + if not self.config.get('stats_mask') or self.config.get('skip_transform_inversion'): + raise LamaConfigError("To calculate embryo volume the following options must be set\n" + "'stats_mask', which is the tight mask used for statistical analysis and calculating whole embryo volume\n" + "'skip_transform_inversion' must be False as the inversions are needed to calculate embryo volume") + + non_def_stages = [x for x in self.config['registration_stage_params'] if + x['elastix_parameters']['Transform'] in + ['SimilarityTransform', 'AffineTransform']] + if not non_def_stages: + raise LamaConfigError("In order to calculate embryo volume an affine or similarity stage is needed") self.options['staging'] = st @@ -314,6 +347,8 @@ def check_stages(self): def check_images(self): """ validate that image paths are correct and give loadable volumes + + For now just check for spaces in filenames """ img_dir = self.options['inputs'] @@ -335,21 +370,27 @@ def check_images(self): for im_name in imgs: image_path = join(img_dir, im_name) - array_load = common.LoadImage(image_path) - if not array_load: - logging.error(array_load.error_msg) - raise FileNotFoundError(f'cannot load {image_path}') - - self.check_dtype(self.config, array_load.array, array_load.img_path) - self.check_16bit_elastix_parameters_set(self.config, array_load.array) - dtypes[im_name] = array_load.array.dtype - - if len(set(dtypes.values())) > 1: - dtype_str = "" - for k, v in list(dtypes.items()): - dtype_str += k + ':\t' + str(v) + '\n' - logging.warning('The input images have a mixture of data types\n{}'.format(dtype_str)) - + self.space_filename_check(image_path) + + # array_load = common.LoadImage(image_path) + # if not array_load: + # logging.error(array_load.error_msg) + # raise FileNotFoundError(f'cannot load {image_path}') + # + # self.check_dtype(self.config, array_load.array, array_load.img_path) + # self.check_16bit_elastix_parameters_set(self.config, array_load.array) + # dtypes[im_name] = array_load.array.dtype + # + # if len(set(dtypes.values())) > 1: + # dtype_str = "" + # for k, v in list(dtypes.items()): + # dtype_str += k + ':\t' + str(v) + '\n' + # logging.warning('The input images have a mixture of data types\n{}'.format(dtype_str)) + + def space_filename_check(self, name): + name = str(name) + if ' ' in name: + raise ValueError(f'{name} has spaces in it. \nFiles must not contain spaces')
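+ # (Paths end up in commands for external tools such as elastix, where spaces in + # filenames can break argument handling, so it is safer to fail early here.)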
def check_16bit_elastix_parameters_set(self, config, array): if array.dtype in (np.int16, np.uint16): @@ -402,6 +443,7 @@ def check_paths(self): failed.append(target_path) else: self.options[tn] = target_path + self.space_filename_check(target_path) else: self.options[tn] = None @@ -428,6 +470,7 @@ def check_for_unknown_options(self): msg = "The following option is not recognised: {}\nDid you mean: {} ?".format(param, ", ".join(closest_matches)) logging.error(msg) raise LamaConfigError(msg) def pairwise_check(self): @@ -449,7 +492,9 @@ def convert_image_pyramid(self): """ The elastix image pyramid needs to be specified for each dimension for each resolution - This function allows it to specified for just one resolution as it should always be the same for each dimension. + This function allows it to be specified for just one resolution, as it should always be the same for each dimension + since we are assuming isotropic data. Convert to elastix required format Parameters ---------- parameters """ + return # elastix allows pyramid schedule to be specified once per resolution. Maybe this was for an older version? for stage in self.config['registration_stage_params']: elx_params = stage['elastix_parameters'] if elx_params.get('ImagePyramidSchedule') and elx_params.get('NumberOfResolutions'): diff --git a/lama/scripts/lama_job_runner.py b/lama/scripts/lama_job_runner.py index 22986fd6..ccc23bc1 100755 --- a/lama/scripts/lama_job_runner.py +++ b/lama/scripts/lama_job_runner.py @@ -142,9 +142,9 @@ def lama_job_runner(config_path: Path, while True: try: + # Create a lock then read jobs and add status to job file to ensure job is run once only. with lock.acquire(timeout=60): - # Create a lock then read jobs and add status to job file to ensure job is run once only. df_jobs = pd.read_csv(job_file, index_col=0) # Get an unfinished job diff --git a/lama/scripts/lama_reg.py b/lama/scripts/lama_reg.py index 5c0237f3..bc06837d 100755 --- a/lama/scripts/lama_reg.py +++ b/lama/scripts/lama_reg.py @@ -12,8 +12,9 @@ def main(): setup tools console_scripts used to allow command line running of scripts needs to have a function that takes zero arguments.
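+ Illustrative usage, assuming the console_scripts entry point is named lama_reg: + lama_reg -c config.toml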
""" + from lama.elastix import invert_transforms parser = argparse.ArgumentParser("The LAMA registration pipeline") - parser.add_argument('-c', dest='config', help='Config file (YAML format)', required=True) + parser.add_argument('-c', dest='config', help='Config file (TOML format)', required=True) args = parser.parse_args() run_lama.run(Path(args.config)) diff --git a/lama/staging/affine_similarity_scaling_factors.py b/lama/staging/affine_similarity_scaling_factors.py index ca3e15c3..c82b19b0 100755 --- a/lama/staging/affine_similarity_scaling_factors.py +++ b/lama/staging/affine_similarity_scaling_factors.py @@ -3,10 +3,9 @@ import os from os.path import join, dirname -from pathlib import Path import numpy as np import sys -import argparse +# Todo remove this sys.path.insert(0, join(dirname(__file__), '..')) import lib.transformations as trans @@ -31,7 +30,7 @@ def get_scaling_factor(tform_params): def extract_affine_transformation_parameters(path): - scaling_factors = [] + for file_ in os.listdir(path): if not file_ == TFORM_FILE_NAME: continue diff --git a/lama/stats/common.py b/lama/stats/common.py new file mode 100644 index 00000000..9b87f739 --- /dev/null +++ b/lama/stats/common.py @@ -0,0 +1,12 @@ +from statistics import mean +from numpy import std, mean, sqrt + + +def cohens_d(x, y): + + nx = len(x) + ny = len(y) + dof = nx + ny - 2 + + # For testing + return (mean(x) - mean(y)) / sqrt(((nx - 1) * std(x, ddof=1) ** 2 + (ny - 1) * std(y, ddof=1) ** 2) / dof) \ No newline at end of file diff --git a/lama/stats/heatmap.py b/lama/stats/heatmap.py new file mode 100644 index 00000000..83dd4495 --- /dev/null +++ b/lama/stats/heatmap.py @@ -0,0 +1,57 @@ +""" +Seaborn 0.9.0 gives missing rows for some reason. So use matplotlib directly instead +""" + +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import seaborn as sns +from typing import Union +import matplotlib + +def heatmap(data: pd.DataFrame, title, use_sns=False): + # import matplotlib.pylab as pylab + # params = {'legend.fontsize': 'x-large', + # 'axes.labelsize': 'x-large', + # 'axes.titlesize': 'x-large', + # 'xtick.labelsize': 'x-large', + # 'ytick.labelsize': 'x-large'} + # pylab.rcParams.update(params) + + + + fig, ax = plt.subplots(figsize = [14,15]) + + if use_sns: + # sns.palplot(sns.color_palette("coolwarm")) + sns.heatmap(data, cmap=sns.color_palette("coolwarm", 100), ax=ax, cbar_kws={'label': 'mean volume ratio'}, + square=True) + + ax.figure.axes[-1].yaxis.label.set_size(22) + cbar = ax.collections[0].colorbar + cbar.ax.tick_params(labelsize=20) + else: + ax.imshow(data) + + xlabels = data.columns + ylabels = [x.replace('_', ' ') for x in data.index] + # We want to show all ticks... + # ax.set_xticks(np.arange(len(xlabels))) + # ax.set_yticks(np.arange(len(ylabels))) + # ... and label them with the respective list entries + ax.set_xticklabels(xlabels, rotation = 90, ha="center") + ax.set_yticklabels(ylabels) + + # Rotate the tick labels and set their alignment. 
+ # plt.setp(ax.get_xticklabels(), rotation=90, ha='center', + # rotation_mode="default") + # ax.xaxis.set_tick_params(horizontalalignment='left') + # plt.xticks(np.arange(len(gamma_range)) + 0.5, gamma_range, rotation=45, ) + # plt.xticks(rotation=90) + # ax.tick_params(axis="x", direction="right", pad=-22) + plt.xticks(fontsize=12) + plt.yticks(fontsize=12) + + + # ax.set_title(title) + diff --git a/lama/stats/penetrence_expressivity_plots.py b/lama/stats/penetrence_expressivity_plots.py new file mode 100644 index 00000000..6af8a369 --- /dev/null +++ b/lama/stats/penetrence_expressivity_plots.py @@ -0,0 +1,138 @@ +""" +Given a folder of stats results (each line in a subfolder), create + heatmaps to summarize the hits across the specimens and at the line-level""" + +from pathlib import Path +from typing import Iterable +import pandas as pd +from lama.stats.heatmap import heatmap +import matplotlib.pyplot as plt +from logzero import logger as logging + + +def heatmaps_form_permutation_stats(root_dir: Path): + """ + This function works on the output of the permutation stats. For the non-permutation stats, we may need a + different function to deal with the different directory layout + """ + + for line_dir in root_dir.iterdir(): + + spec_dir = line_dir / 'specimen_level' + spec_csvs = [] + + for s_dir in spec_dir.iterdir(): + scsv = next(s_dir.iterdir()) + spec_csvs.append(scsv) + + try: + line_hits_csv = next(line_dir.glob(f'{line_dir.name}_organ_volumes*csv')) + except StopIteration: + logging.error(f'cannot find stats results file in {str(line_dir)}') + return + + line_specimen_hit_heatmap(line_hits_csv, spec_csvs, line_dir, line_dir.name) + + +def line_specimen_hit_heatmap(line_hits_csv: Path, + specimen_hits: Iterable[Path], + outdir: Path, + line: str, + sorter_csv=None): + + dfs = {} # All line and specimen hit dfs + + line_hits = pd.read_csv(line_hits_csv, index_col=0) + dfs[line] = line_hits + + for spec_file in specimen_hits: + d = pd.read_csv(spec_file, index_col=0) + + # d = d[d['significant_cal_p'] == True] + + # small_id = get_specimen_id(hits_file.name) for now use full name + # dfs[small_id] = d + dfs[spec_file.name] = d + + # get the superset of all hit labels + hit_labels = set() + for k, x in dfs.items(): + hit_labels.update(x[x['significant_cal_p'] == True].label_name) + + # For each hit table, keep only those in the hit superset and create heat_df + t = [] + for line_or_spec, y in dfs.items(): + y = y[y['label_name'].isin(hit_labels)] + y['label_num'] = y.index + y.set_index('label_name', inplace=True, drop=True) + + y.loc[y.significant_cal_p == False, 'mean_vol_ratio'] = None + + if 'mean_vol_ratio' in y: + col_for_heatmap = 'mean_vol_ratio' + else: + col_for_heatmap = 'significant_cal_p' + + # Rename the column we are to display to the name of the specimen + y.rename(columns={col_for_heatmap: line_or_spec}, inplace=True) + t.append(y[[line_or_spec]]) + + heat_df = pd.concat(t, axis=1) + + # if sorter_csv: + # # We have a csv with column abs(vol-diff) that we can use to sort results + # sorter = pd.read_csv(sorter_csv, index_col=0) + # s = 'abs(vol_diff)' + # heat_df = heat_df.merge(sorter[[s, 'label_name']], left_index=True, right_on='label_name', how='left') + # heat_df.sort_values(s, inplace=True, ascending=False) + # heat_df.set_index('label_name', drop=True, inplace=True) + # heat_df.drop(columns=[s], inplace=True) + + if len(heat_df) < 1: + print(f"skipping {line}. No hits") + return + + title = f'{line}\n Line 5% FDR. Specimen 20% FDR'
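+ # At this point heat_df is organs (rows) x line/specimens (columns): the first column holds the + # line-level mean_vol_ratio and the rest one column per specimen, with NaN where the call was + # not significant (or significant_cal_p values if no ratio column was present). Roughly: + # label_name | <line> | <spec 1> | <spec 2> -> liver | 1.12 | NaN | 0.85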
+ # Try to order lines by litter + ids = list(heat_df.columns) + line_id = ids.pop(0) + + ids.sort(key=lambda x: x[-2]) + sorted_ids = [line_id] + ids + heat_df = heat_df[sorted_ids] + + # Shorten some of the longer organ names + # label_meta = pd.read_csv('/mnt/bit_nfs/neil/Harwell_E14_5_latest/padded_target_ras/E14_5_atlas_v24_40_label_info_nouse.csv', index_col=0) + # label_meta.set_index('label_name', inplace=True, drop=True) + # heat_df = heat_df.merge(label_meta[['short_name']], how='left', left_index=True, right_index=True) + + def sn_mapper(x): + sn = heat_df.loc[x, 'short_name'] + if not isinstance(sn, float): # float nan + idx = sn + else: + idx = x + return idx.lower() + + # heat_df.index = heat_df.index.map(sn_mapper) + # heat_df.drop(columns='short_name', inplace=True) + + heatmap(heat_df, title=title, use_sns=True) + + plt.tight_layout() + + plt.savefig(outdir / f"{line}_organ_hit_heatmap.png") + plt.close() + + +if __name__ == '__main__': + spec_dir = Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/organ_vol_perm_091020/lines/Cox7c/specimen_level') + spec_csvs = [] + + for s_dir in spec_dir.iterdir(): + scsv = next(s_dir.iterdir()) + spec_csvs.append(scsv) + line_specimen_hit_heatmap(Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/organ_vol_perm_091020/lines/Cox7c/Cox7c_organ_volumes_2020-10-09.csv'), + spec_csvs, + Path('/mnt/bit_nfs/neil/impc_e15_5/phenotyping_tests/JAX_E15_5_test_120720/stats/organ_vol_perm_091020/lines/Cox7c'), + 'Cox7c') \ No newline at end of file diff --git a/lama/stats/permutation_stats/distributions.py b/lama/stats/permutation_stats/distributions.py index 52121c8d..5d6c18b2 100644 --- a/lama/stats/permutation_stats/distributions.py +++ b/lama/stats/permutation_stats/distributions.py @@ -89,8 +89,8 @@ def null(input_data: pd.DataFrame, # once # TODO: Why does the LM not try to return a specimen-level p value as well? for index, _ in info.iterrows(): - info['genotype'] = 'wt' # Set all genotypes to WT - info.ix[[index], 'genotype'] = 'synth_hom' # Set the ith baseline to synth hom + info.loc[:, 'genotype'] = 'wt' # Set all genotypes to WT + info.loc[[index], 'genotype'] = 'synth_hom' # Set the ith baseline to synth hom # Get a p-value for each organ p, t = lm_r(data, info) @@ -166,7 +166,7 @@ def _label_synthetic_mutants(info: pd.DataFrame, n: int, sets_done: List) -> boo """ # Set all to wt genotype - info['genotype'] = 'wt' + info.loc[:, 'genotype'] = 'wt' # label n number of baselines as mutants @@ -187,7 +187,8 @@ def _label_synthetic_mutants(info: pd.DataFrame, n: int, sets_done: List) -> boo sets_done.append(set(synthetics_mut_indices)) - info.ix[synthetics_mut_indices, 'genotype'] = 'synth_hom' # Why does iloc not work here? + # info.ix[synthetics_mut_indices, 'genotype'] = 'synth_hom' # Why does iloc not work here?
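+ # (.ix mixed label- and position-based lookup and was removed from pandas; .iloc alone will not + # do here because 'genotype' is a column label, not a position, so map the positional row + # indices to index labels via info.index and assign with .loc.)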
+ info.loc[info.index[synthetics_mut_indices], 'genotype'] = 'synth_hom' return True @@ -198,7 +199,7 @@ def strip_x(dfs): def alternative(input_data: pd.DataFrame, plot_dir: Union[None, Path] = None, - boxcox: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]: + boxcox: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Generate alternative (mutant) distributions for line and specimen-level data @@ -210,7 +211,11 @@ def alternative(input_data: pd.DataFrame, Returns ------- - alternative distribution dataframes with either line or specimen as index + alternative distribution dataframes with either line or specimen as index. + 0: line-level p values + 1: specimen-level p values + 2: line-level t-values + 3: specimen-level t-values """ # Group by line and sequentially run @@ -219,10 +224,12 @@ def alternative(input_data: pd.DataFrame, label_names = list(input_data.drop(['staging', 'line'], axis='columns').columns) baseline = input_data[input_data['line'] == 'baseline'] - baseline['genotype'] = 'wt' + baseline.loc[:, 'genotype'] = 'wt' alt_line_pvalues = [] alt_spec_pvalues = [] + alt_line_t = [] + alt_spec_t = [] # Get line-level alternative distributions for line_id, line_df in line_groupby: @@ -230,7 +237,7 @@ if line_id == 'baseline': continue - line_df['genotype'] = 'hom' + line_df.loc[:, 'genotype'] = 'hom' line_df.drop(['line'], axis=1) # ? df_wt_mut = pd.concat([baseline, line_df]) @@ -246,6 +253,9 @@ res = [line_id] + list(p) alt_line_pvalues.append(res) + res_t = [line_id] + list(t) + alt_line_t.append(res_t) + # Get specimen-level alternative distributions mutants = input_data[input_data['line'] != 'baseline'] # baselines = input_data[input_data['line'] == 'baseline'] @@ -260,10 +270,18 @@ res = [line_id, specimen_id] + list(p) alt_spec_pvalues.append(res) + res_t = [specimen_id] + list(t) + alt_spec_t.append(res_t) + # result dataframes have either line or specimen in index then labels alt_line_df = pd.DataFrame.from_records(alt_line_pvalues, columns=['line'] + label_names, index='line') - alt_spec_df = pd.DataFrame.from_records(alt_spec_pvalues, columns=['line', 'specimen'] + label_names, index='specimen') + alt_spec_df = pd.DataFrame.from_records(alt_spec_pvalues, columns=['line', 'specimen'] + label_names, + index='specimen') + + alt_line_t_df = pd.DataFrame.from_records(alt_line_t, columns=['line'] + label_names, index='line') + alt_spec_t_df = pd.DataFrame.from_records(alt_spec_t, columns=['specimen'] + label_names, + index='specimen') - strip_x([alt_line_df, alt_spec_df]) + strip_x([alt_line_df, alt_spec_df, alt_line_t_df, alt_spec_t_df]) - return alt_line_df, alt_spec_df + return alt_line_df, alt_spec_df, alt_line_t_df, alt_spec_t_df diff --git a/lama/stats/permutation_stats/run_permutation_stats.py b/lama/stats/permutation_stats/run_permutation_stats.py index 7b2dbd91..4f78fb08 100644 --- a/lama/stats/permutation_stats/run_permutation_stats.py +++ b/lama/stats/permutation_stats/run_permutation_stats.py @@ -44,6 +44,7 @@ import pandas as pd import numpy as np +from scipy.stats import zmap from logzero import logger as logging import yaml @@ -53,11 +54,26 @@ from lama.paths import specimen_iterator from lama.qc.organ_vol_plots import make_plots, pvalue_dist_plots from lama.common import write_array, read_array, init_logging - - +from lama.stats.common import cohens_d +from lama.stats.penetrence_expressivity_plots import heatmaps_form_permutation_stats
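+# Note: scipy.stats.zmap(scores, compare), used by write_specimen_info below, returns each mutant's +# staging metric as a z-score relative to the wild-type distribution, i.e. (score - mean(compare)) / std(compare).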
GENOTYPE_P_COL_NAME = 'genotype_effect_p_value' PERM_SIGNIFICANT_COL_NAME = 'significant_cal_p' +PERM_T_COL_NAME = 't' + + +def write_specimen_info(wt_wev, mut_wev, outfile, sd=2.0): + """ + Write a csv with some summary info on specimens + currently only returns Z-score of mutants + """ + wev_z = zmap(mut_wev.staging, wt_wev.staging) + mut_wev['WEV_zscore'] = wev_z + mut_wev.sort_values('WEV_zscore', inplace=True) + mut_wev.to_csv(outfile) def get_organ_volume_data(root_dir: Path) -> pd.DataFrame: @@ -139,9 +155,16 @@ def get_staging_data(root_dir: Path) -> pd.DataFrame: return all_staging -def annotate(thresholds: pd.DataFrame, lm_results: pd.DataFrame, lines_root_dir: Path, line_level: bool = True, - label_info: Path = None, label_map: Path = None, write_thresholded_inv_labels=False, - fdr_threshold: float=0.05): +def annotate(thresholds: pd.DataFrame, + lm_results: pd.DataFrame, + lines_root_dir: Path, + line_level: bool = True, + label_info: Path = None, + label_map: Path = None, + write_thresholded_inv_labels=False, + fdr_threshold: float=0.05, + t_values: pd.DataFrame=None, + organ_volumes: pd.DataFrame=None): """ Using the p_value thresholds and the linear model p-value results, create the following CSV files @@ -157,16 +180,20 @@ The alternative distribution index: line/specimen id cols: labels (+ line_id for specimen_level) - outdir - The root directory to save the annotated CSV files + lines_root_dir + The root directory to save the annotated CSV files. Each line goes in a subfolder line_level if not True, place results in specimen-level sub directory label_info CSV to map label number to name + t_values + same format as lm_results but with t-statistics + organ_volumes + All the organ volumes for baselines and mutants (as used in lm(), so probably normalised to whole embryo volume) + Notes ----- - Today's date added to the stats output folder in case it's run multiple times, TODO: Add file number prefixes so we don't overwrite multiple analyses done on the same day TODO: the organ_volumes folder name is hard-coded. What about if we add a new analysis type to the permutation stats pipeline? """ @@ -175,13 +202,14 @@ if label_map: label_map = read_array(label_map) + # Iterate over each line or specimen (for line or specimen-level analysis) for id_, row in lm_results.iterrows(): - # Create a dataframe containing p-value column. each organ on rows + # Create a dataframe containing a p-value column. Each row is an organ
df = row.to_frame() if not line_level: - # specimne-level has an extra line column we need to remove + # specimen-level has an extra line column we need to remove df = df.T.drop(columns=['line']).T # Rename the line_specimen column to be more informative @@ -192,11 +220,44 @@ else: line = row['line'] - # Merge the permutation results (p-thresh, fdr, number of hit lines fo this label) with the mutant results + # Merge the permutation results (p-thresh, fdr, number of hit lines for this label) with the mutant results df.index = df.index.astype(np.int64) # Index needs to be cast from object to enable merge df = df.merge(thresholds, left_index=True, right_index=True, validate='1:1') df.index.name = 'label' + # Merge the t-statistics + if t_values is not None: + t_df = pd.DataFrame(t_values.loc[id_]) + + if t_df.shape[1] != 1: # We get multiple columns if there are duplicate specimen/line ids + raise ValueError("Duplicate specimen names not allowed") + + t_df.columns = ['t'] + t_df.drop(columns=['line'], errors='ignore', inplace=True) # this is for specimen-level results + + t_df.index = t_df.index.astype(np.int64) + + df = df.merge(t_df, left_index=True, right_index=True, validate='1:1') + if len(df) < 1: + logging.info(f'skipping {id_} no hits') # Should we continue at this point? + + # Add mean organ vol difference and cohens d + df['mean_vol_ratio'] = None + if line_level: + df['cohens_d'] = None + + for label, row in df.iterrows(): + # Organ vols are prefixed with x so it can work with statsmodels + label_col = f'x{label}' + label_organ_vol = organ_volumes[[label_col, 'line']] + wt_ovs = label_organ_vol[label_organ_vol.line == 'baseline'][f'x{label}'] + mut_ovs = label_organ_vol[label_organ_vol.line == line][f'x{label}'] + + df.loc[label, 'mean_vol_ratio'] = mut_ovs.mean() / wt_ovs.mean() + if line_level: + cd = cohens_d(mut_ovs, wt_ovs) + df.loc[label, 'cohens_d'] = cd + output_name = f'{id_}_organ_volumes_{str(date.today())}.csv' line_output_dir = lines_root_dir / line @@ -355,7 +416,7 @@ def run(wt_dir: Path, mut_dir: Path, out_dir: Path, num_perms: int, np.random.seed(999) init_logging(out_dir / 'stats.log') logging.info(common.git_log()) - logging.info(f'Running {__name__} with followng commands\n{common.command_line_agrs()}') + logging.info(f'Running {__name__} with following commands\n{common.command_line_agrs()}') wt_staging = get_staging_data(wt_dir) mut_staging = get_staging_data(mut_dir) @@ -395,8 +456,8 @@ line_null.to_csv(null_line_pvals_file) specimen_null.to_csv(null_specimen_pvals_file) - # Get the alternative distribution - line_alt, spec_alt = distributions.alternative(data) + # Get the alternative p-value distributions (and now also the t-values, items 2 and 3) + line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data) line_alt_pvals_file = dists_out / 'alt_line_dist_pvalues.csv' spec_alt_pvals_file = dists_out / 'alt_specimen_dist_pvalues.csv' @@ -422,16 +483,24 @@ # Annotate lines logging.info(f"Annotating lines, using a FDR threshold of {line_fdr}") annotate(line_organ_thresholds, line_alt, lines_root_dir, label_info=label_info, - label_map=label_map_path, write_thresholded_inv_labels=True,fdr_threshold=line_fdr) + label_map=label_map_path, write_thresholded_inv_labels=True, fdr_threshold=line_fdr, t_values=line_alt_t, +
organ_volumes=data) # Annotate specimens logging.info(f"Annotating specimens, using a FDR threshold of {specimen_fdr}") annotate(specimen_organ_thresholds, spec_alt, lines_root_dir, line_level=False, - label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr) + label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr, t_values=spec_alt_t, + organ_volumes=data) + # Make plots mut_dir_ = mut_dir / 'output' make_plots(mut_dir_, raw_wt_vols, wt_staging, label_info, lines_root_dir) + + # Get specimen info. Currently just the WEV z-score to highlight specimens that are too small/large + spec_info_file = out_dir / 'specimen_info.csv' + write_specimen_info(wt_staging, mut_staging, spec_info_file) + dist_plot_root = out_dir / 'distribution_plots' line_plot_dir = dist_plot_root / 'line_level' line_plot_dir.mkdir(parents=True, exist_ok=True) @@ -441,6 +510,10 @@ def run(wt_dir: Path, mut_dir: Path, out_dir: Path, num_perms: int, specimen_plot_dir.mkdir(parents=True, exist_ok=True) pvalue_dist_plots(specimen_null, spec_alt.drop(columns=['line']), specimen_organ_thresholds, specimen_plot_dir) + heatmaps_form_permutation_stats(lines_root_dir) + + + diff --git a/lama/stats/standard_stats/data_loaders.py b/lama/stats/standard_stats/data_loaders.py index da3ad9fb..a13be204 100644 --- a/lama/stats/standard_stats/data_loaders.py +++ b/lama/stats/standard_stats/data_loaders.py @@ -34,6 +34,10 @@ from lama.img_processing.misc import blur from lama.paths import specimen_iterator +import os +import gc +import sys +import psutil GLCM_FILE_SUFFIX = '.npz' DEFAULT_FWHM = 100 # um @@ -91,6 +95,8 @@ def __init__(self, self.size = np.prod(shape) self.mask = mask + logging.info(f"Create line data '{self.line}'") + if len(data) != len(info): raise ValueError @@ -127,23 +133,14 @@ def get_num_chunks(self, log: bool=False): overhead_factor = 100 # debug bytes_free = common.available_memory() - num_samples = len(self.data) + procMemRSS_bytes = psutil.Process(os.getpid()).memory_info().rss - try: - self.data.shape - except AttributeError: # List of numpy arrays - dtype_size = self.data[0].dtype.itemsize - num_voxels = self.data[0].size - else: # Dataframes - num_voxels = self.data.shape[1] - dtype_size = self.data.values.dtype.itemsize + num_chunks = math.ceil((procMemRSS_bytes * overhead_factor) / bytes_free) - data_size = dtype_size * num_samples * num_voxels + if log: + logging.info(f'\nAvailable memory: {common.bytesToGb(bytes_free)} GB\nProcess memory: {common.bytesToGb(procMemRSS_bytes)} GB') - num_chunks = math.ceil((data_size * overhead_factor) / bytes_free) - if log: - logging.info(f'\nAvailable memory: {round(bytes_free / (1024 **3), 3)} GB\nSize of raw data: {round(data_size / (1024**3), 3)} GB') if num_chunks > 1: return num_chunks @@ -191,6 +188,35 @@ def chunks(self) -> Iterator[np.ndarray]: def mask_size(self) -> int: return self.mask[self.mask == 1].size + def cleanup(self): + logging.info(f"Cleanup line data '{self.line}', object size {sys.getsizeof(self)} byte.") + + if self.data is not None: + self.data = None + + if self.shape is not None: + self.shape = None + + if self.info is not None: + self.info = None + + if self.paths is not None: + self.paths = None + + if self.outdirs is not None: + self.outdirs = None + + if self.size is not None: + self.size = None + + if self.mask is not None: + self.mask = None + + gc.collect() + + def __del__(self): + logging.info(f"Finalize line data '{self.line}'") + class DataLoader: """ @@ -644,7 +670,7 @@ def line_iterator(self) -> 
LineData: except KeyError: pass - if self.norm_to_mask_volume_on: + if self.norm_to_mask_volume_on: # Is on by default logging.info('normalising organ volume to whole embryo volumes') data = data.div(staging['staging'], axis=0) input_ = LineData(data, staging, line, self.shape, ([self.wt_dir], [self.mut_dir])) diff --git a/lama/stats/standard_stats/lama_stats_new.py b/lama/stats/standard_stats/lama_stats_new.py index ca57e83d..96bdecf2 100644 --- a/lama/stats/standard_stats/lama_stats_new.py +++ b/lama/stats/standard_stats/lama_stats_new.py @@ -16,14 +16,17 @@ from logzero import logger as logging import logzero +import gc + from lama.common import cfg_load -from lama.stats.standard_stats.stats_objects import Stats +from lama.stats.standard_stats.stats_objects import Stats, OrganVolume from lama.stats.standard_stats.data_loaders import DataLoader, load_mask, LineData from lama.stats.standard_stats.results_writer import ResultsWriter from lama import common from lama.stats import linear_model from lama.elastix.invert_volumes import InvertHeatmap from lama.img_processing.normalise import Normaliser +from lama.qc import organ_vol_plots def run(config_path: Path, @@ -81,7 +84,7 @@ def run(config_path: Path, label_map_file = target_dir / stats_config.get('label_map') label_map = common.LoadImage(label_map_file).array - memmap = stats_config.get('label_map') + memmap = stats_config.get('memmap') if memmap: logging.info('Memory mapping input data') @@ -97,58 +100,91 @@ def run(config_path: Path, for stats_type in stats_config['stats_types']: logzero.logfile(str(master_log_file)) - logging.info(f"Doing {stats_type} analysis") + logging.info(f"---Doing {stats_type} analysis---") + + gc.collect() + # load the required stats object and data loader loader_class = DataLoader.factory(stats_type) loader = loader_class(wt_dir, mut_dir, mask, stats_config, label_info_file, lines_to_process=lines_to_process, baseline_file=baseline_file, mutant_file=mutant_file, memmap=memmap) - if stats_config.get('normalise_organ_vol_to_mask') and hasattr(loader, 'norm_organ_vols_to_mask'): - loader.norm_organ_vols_to_mask() + # Only affects organ vol loader. + if not stats_config.get('normalise_organ_vol_to_mask'): + loader.norm_to_mask_volume_on = False # Currently only the intensity stats get normalised loader.normaliser = Normaliser.factory(stats_config.get('normalise'), stats_type) # move this into subclass - for line_input_data in loader.line_iterator(): # NOTE: This might be where we could parallelize - - line_id = line_input_data.line - - line_stats_out_dir = out_dir / line_id / stats_type - - line_stats_out_dir.mkdir(parents=True, exist_ok=True) - line_log_file = line_stats_out_dir / f'{common.date_dhm()}_stats.log' - logzero.logfile(str(line_log_file)) - - logging.info(f"Processing line: {line_id}") - - stats_class = Stats.factory(stats_type) - stats_obj = stats_class(line_input_data, stats_type, stats_config.get('use_staging', True)) - - stats_obj.stats_runner = linear_model.lm_r - stats_obj.run_stats() - - logging.info('statistical analysis finished. 
Writing results.') - - rw = ResultsWriter.factory(stats_type) - writer = rw(stats_obj, mask, line_stats_out_dir, stats_type, label_map, label_info_file) - - # - # if stats_type == 'organ_volumes': - # c_data = {spec: data['t'] for spec, data in stats_obj.specimen_results.items()} - # c_df = pd.DataFrame.from_dict(c_data) - # # cluster_plots.tsne_on_raw_data(c_df, line_stats_out_dir) - - if stats_config.get('invert_stats'): - if writer.line_heatmap: # Organ vols wil not have this - # How do I now sensibily get the path to the invert.yaml - # get the invert_configs for each specimen in the line - logging.info('Propogating the heatmaps back onto the input images ') - line_heatmap = writer.line_heatmap - line_reg_dir = mut_dir / 'output' / line_id - invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data) + logging.info("Starting to iterate through lines") + common.logMemoryUsageInfo() + + line_iterator = loader.line_iterator() + line_input_data = None + + while True: + try: + line_input_data = next(line_iterator) + logging.info(f"Data for line {line_input_data.line} loaded") + common.logMemoryUsageInfo() + + + line_id = line_input_data.line + + line_stats_out_dir = out_dir / line_id / stats_type + + line_stats_out_dir.mkdir(parents=True, exist_ok=True) + line_log_file = line_stats_out_dir / f'{common.date_dhm()}_stats.log' + logzero.logfile(str(line_log_file)) + + logging.info(f"Processing line: {line_id}") + + stats_class = Stats.factory(stats_type) + stats_obj = stats_class(line_input_data, stats_type, stats_config.get('use_staging', True)) + + stats_obj.stats_runner = linear_model.lm_r + stats_obj.run_stats() + + logging.info('Statistical analysis finished.') + common.logMemoryUsageInfo() + + logging.info('Writing results...') + + rw = ResultsWriter.factory(stats_type) + writer = rw(stats_obj, mask, line_stats_out_dir, stats_type, label_map, label_info_file) + + logging.info('Finished writing results.') + common.logMemoryUsageInfo() + # + # if stats_type == 'organ_volumes': + # c_data = {spec: data['t'] for spec, data in stats_obj.specimen_results.items()} + # c_df = pd.DataFrame.from_dict(c_data) + # # cluster_plots.tsne_on_raw_data(c_df, line_stats_out_dir) + + + if stats_config.get('invert_stats'): + if writer.line_heatmap: # Organ vols will not have this + # How do I now sensibly get the path to the invert.yaml + # get the invert_configs for each specimen in the line + logging.info('Writing heatmaps...') + logging.info('Propagating the heatmaps back onto the input images ') + line_heatmap = writer.line_heatmap + line_reg_dir = mut_dir / 'output' / line_id + invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data) + logging.info('Finished writing heatmaps.') + + logging.info(f"Finished processing line: {line_id} - All done") + common.logMemoryUsageInfo() + + except StopIteration: + if line_input_data is not None: + logging.info("Finished iterating through lines") + line_input_data.cleanup() + common.logMemoryUsageInfo() + break + - logging.info('All done') def invert_heatmaps(heatmap: Path, diff --git a/lama/stats/standard_stats/results_writer.py b/lama/stats/standard_stats/results_writer.py index 6aac36d7..96d7d373 100644 --- a/lama/stats/standard_stats/results_writer.py +++ b/lama/stats/standard_stats/results_writer.py @@ -73,15 +73,20 @@ def __init__(self, line_qvals = results.line_qvals line_pvals = results.line_pvalues # Need to get these into organ volumes + logging.info('Writing line-level results...') + + line_threshold_file = self.out_dir /
f'Qvals_{stats_name}_{self.line}.csv' write_threshold_file(line_qvals, line_tstats, line_threshold_file) # this is for the lama_stats to know where the heatmaps for inversion are self.line_heatmap = self._write(line_tstats, line_pvals, line_qvals, self.out_dir, self.line) # Bodge. Change! # 140620: I think this is where it's dying - print('#finished Writing line results') # pvalue_fdr_plot(results.line_pvalues, results.line_qvals, out_dir) + logging.info('Finished writing line-level results') + + logging.info('Writing specimen-level results...') + specimen_out_dir = out_dir / 'specimen-level' specimen_out_dir.mkdir(exist_ok=True) @@ -95,6 +100,8 @@ def __init__(self, self._write(spec_t, spec_p, spec_q, specimen_out_dir, spec_id) # self.log(self.out_dir, 'Organ_volume stats', results.input_) + logging.info('Finished writing specimen-level results') + @staticmethod def factory(data_type): diff --git a/lama/tests/README.md b/lama/tests/README.md index cea34843..832f45e4 100644 --- a/lama/tests/README.md +++ b/lama/tests/README.md @@ -26,3 +26,4 @@ lama_phenotype_detection/tests/run_tests.sh ### TODO * Make more tests * Make tests that query the output data to ensure everything has been processed correctly +* Make the permutation tests output to main stats folder diff --git a/lama/tests/__init__.py b/lama/tests/__init__.py index bebdb522..8bafd459 100644 --- a/lama/tests/__init__.py +++ b/lama/tests/__init__.py @@ -14,6 +14,7 @@ wt_registration_dir = registration_root / 'baseline' mut_registration_dir = registration_root / 'mutant' target_dir = registration_root / 'target' +target_img = target_dir / '301015_deformable_to_8_rescaled_8bit.nrrd' stats_test_data_dir = test_data_root / 'stats_test_data' stats_output_dir = stats_test_data_dir / 'test_output' diff --git a/lama/tests/archive/test_run_permutation_stats.py b/lama/tests/archive/test_run_permutation_stats.py index d7d93d80..cb5abfeb 100644 --- a/lama/tests/archive/test_run_permutation_stats.py +++ b/lama/tests/archive/test_run_permutation_stats.py @@ -4,7 +4,7 @@ Usage ----- -These functions test the lama registration pipeline +These functions test the lama registration pipeline permutation stats module Usage: pytest -v -m "not notest" test_run_permutation_stats.py """ @@ -21,17 +21,15 @@ from lama.stats.permutation_stats import p_thresholds import pandas as pd # Import from __init__ -from . import test_data_root, registration_root, wt_registration_dir, mut_registration_dir, target_dir +from lama.tests.archive import test_data_root, registration_root, wt_registration_dir, mut_registration_dir, target_dir from lama.common import ORGAN_VOLUME_CSV_FILE, STAGING_INFO_FILENAME outdir = test_data_root / 'test_output' -@pytest.mark.notest -def test_prepare_data(): +def test_prepare_data(n_baselines=100, n_mutant_lines=20, n_labels=2): """ - Prepare data takes in a bunch of csv files containing organ volumes, staging information and label meta data - It returns a combined pd.DataFrame that is then used for the permutation testing. + Generates fake organ volume data for use in testing the permutation stats pipeline The returned data frame has the following columns index: specimen id @@ -42,7 +40,7 @@ This test tests that the output is correct.
""" - + wt_organ_vol = pd.read_csv(wt_registration_dir / 'output' / ORGAN_VOLUME_CSV_FILE, index_col=0) wt_staging = pd.read_csv(wt_registration_dir / 'output' / STAGING_INFO_FILENAME, index_col=0) mut_organ_vol = pd.read_csv(mut_registration_dir / 'output' / ORGAN_VOLUME_CSV_FILE, index_col=0) diff --git a/lama/tests/test_config_errors.py b/lama/tests/test_config_errors.py new file mode 100644 index 00000000..b24dbcad --- /dev/null +++ b/lama/tests/test_config_errors.py @@ -0,0 +1,59 @@ +""" +Test the steps needed to generate wild type and mutant data for use in the statistical analysis + +Usage: pytest -v -m "not notest" test_data_generation.py +The use of -m "not notest" is to be able to omit certain tests with the @pytest.mark.notest decorator +""" + +from pathlib import Path +from lama.registration_pipeline import run_lama, validate_config +from lama.common import cfg_load + +import os +import shutil +import pytest + +from lama.scripts import lama_job_runner +from lama.tests import (registration_root, mut_registration_dir, wt_registration_dir) + +@pytest.mark.notest +@pytest.fixture +def delete_previous_files(): + """ + Remove the output generated from previous tests. This does not occur directly after the test as we may want to + look at the results. + """ + def delete(root: Path): + shutil.rmtree(root / 'output', ignore_errors=True) + for p in root.iterdir(): + if str(p).endswith(('.log', 'jobs.csv', 'csv.lock', '.yaml')): + p.unlink() + + delete(wt_registration_dir) + delete(mut_registration_dir) + + + +def test_config_errors(): + """ + Read in the current config that shuld work + + """ + + # config_file = registration_root / 'registration_config.toml' + config_file = registration_root / 'registration_config.toml' + config = cfg_load(config_file) + + # Staging = embryo_volume needs at least one similarity/affine stage to work + # for i, stage in enumerate(config['registration_stage_params']): + # if stage['elastix_parameters']['Transform'] in ['EulerTransform', 'AffineTransform']: + # del(config['registration_stage_params'][i]) + + config['registration_stage_params'][:] = [x for x in config['registration_stage_params'] if + x['elastix_parameters']['Transform'] not in ['SimilarityTransform', 'AffineTransform']] + + cfg = validate_config.LamaConfig(config, config_file) + + # + # assert lama_job_runner.lama_job_runner(config_file, wt_registration_dir) is True + # assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True diff --git a/lama/tests/test_data_generation.py b/lama/tests/test_data_generation.py index cff1bfd2..9f3dd575 100644 --- a/lama/tests/test_data_generation.py +++ b/lama/tests/test_data_generation.py @@ -12,8 +12,8 @@ import shutil import pytest -from scripts import lama_job_runner -from . 
diff --git a/lama/tests/test_data_generation.py b/lama/tests/test_data_generation.py
index cff1bfd2..9f3dd575 100644
--- a/lama/tests/test_data_generation.py
+++ b/lama/tests/test_data_generation.py
@@ -12,8 +12,8 @@
 import shutil
 import pytest
 
-from scripts import lama_job_runner
-from . import (registration_root, mut_registration_dir, wt_registration_dir)
+from lama.scripts import lama_job_runner
+from lama.tests import (registration_root, mut_registration_dir, wt_registration_dir)
 
 
 @pytest.fixture
@@ -50,7 +50,6 @@ def test_lama_job_runner():
     """
     config_file = registration_root / 'registration_config.toml'
-
     assert lama_job_runner.lama_job_runner(config_file, wt_registration_dir) is True
     assert lama_job_runner.lama_job_runner(config_file, mut_registration_dir) is True
diff --git a/lama/tests/test_population_average.py b/lama/tests/test_population_average.py
index 029d5980..3ce6f1be 100755
--- a/lama/tests/test_population_average.py
+++ b/lama/tests/test_population_average.py
@@ -11,11 +11,13 @@
 from pathlib import Path
 
 from lama.registration_pipeline import run_lama
 
+import pytest
+
 # Import paths from __init__.py
 from . import (population_test_dir)
 
-
+@pytest.mark.skip
 def test_population_average():
     """
     lama has only one arg, the config file. Loop over all the configs to test and
@@ -25,3 +27,8 @@
     config_file = Path(population_test_dir) / 'population_average_config.toml'
 
     run_lama.run(config_file)
+
+
+def test_pairwise_average():
+    config_file = Path(population_test_dir) / 'pairwise_population_average_config.toml'
+    run_lama.run(config_file)
\ No newline at end of file
diff --git a/lama/tests/test_qc_images.py b/lama/tests/test_qc_images.py
new file mode 100644
index 00000000..4b807dfa
--- /dev/null
+++ b/lama/tests/test_qc_images.py
@@ -0,0 +1,22 @@
+"""
+Functions to test QC image creation.
+Manual checking of the output is required to make sure the images are as expected.
+
+
+Notes
+-----
+    These tests require test_data_generation.py to have been run previously so it has a data folder to work on
+    The test data is very small, but it should give an idea of whether the QC images are being made correctly
+    I may add full size images in the future.
+"""
+
+import pytest
+from . import wt_registration_dir, target_img
+from lama.qc import qc_images
+
+# A lama specimen directory created when running test_data_generation.py
+lama_specimen_dir = wt_registration_dir / 'output' / 'baseline' / '20140122_SCN4A_18.1_e_wt_rec_scaled_3.1241_pixel_14'
+outdir = lama_specimen_dir / 'output' / 'qc'
+
+def test_qc_images():
+    qc_images.make_qc_images(lama_specimen_dir, target_img, outdir)
\ No newline at end of file
diff --git a/lama/tests/test_run_permutation_stats.py b/lama/tests/test_run_permutation_stats.py
new file mode 100644
index 00000000..da385639
--- /dev/null
+++ b/lama/tests/test_run_permutation_stats.py
@@ -0,0 +1,88 @@
+"""
+Test the permutation-based stats pipeline
+
+Usage
+-----
+
+These functions test the lama pipeline's permutation stats module
+
+Usage: pytest -v -m "not notest" test_run_permutation_stats.py
+"""
+
+
+import sys
+from pathlib import Path
+import numpy as np
+import pytest
+
+# from nose.tools import assert_raises, eq_, ok_, nottest
+from lama.stats.permutation_stats import run_permutation_stats
+from lama.stats.permutation_stats import p_thresholds
+import pandas as pd
+# Import from __init__
+from lama.tests import test_data_root, registration_root, wt_registration_dir, mut_registration_dir, target_dir
+from lama.common import ORGAN_VOLUME_CSV_FILE, STAGING_INFO_FILENAME
+
+
+outdir = test_data_root / 'test_output'
+
+
+
+def test_permutation_stats():
+    """
+    Run the whole permutation based stats pipeline.
+    Currently this just checks to see if the pipeline completes without any errors.
+    """
+    num_perms = 5  # Would do 1000 or more normally
+    label_info = registration_root / 'target' / 'label_info.csv'
+    label_map = registration_root / 'target' / 'labels.nrrd'
+    perm_out_dir = outdir / 'organ_vols_permutation'  # Intermediate results go here. Permutation distributions etc.
+
+    run_permutation_stats.run(wt_registration_dir, mut_registration_dir, perm_out_dir, num_perms,
+                              label_info=label_info, label_map_path=label_map)
+
+
+@pytest.mark.notest
+def test_p_thresholds():
+    """
+    Testing the p_thresholds calculation
+
+    TODO: Add more tests for different cases
+    """
+    # These simulate null distributions from 100 baselines for two organs
+    null = pd.DataFrame.from_dict({
+        '1': np.linspace(0.01, 1, 100),
+        '2': np.linspace(0.01, 1, 100)})
+
+    # These simulate alternative distributions for two organs from 40 lines
+    alt = pd.DataFrame.from_dict({
+        '1': np.linspace(0.00000001, 0.1, 40),
+        '2': np.linspace(0.9, 1, 40)})
+
+    thresh = p_thresholds.get_thresholds(null, alt)
+
+    assert thresh.loc[1, 'p_thresh'] == 0.02  # Gives a p-value threshold of 0.02
+    assert thresh.loc[2, 'p_thresh'] == 1.0  # Gives a p-value threshold of 1.0 as there are no low p-values in the alt distribution
+
+
+# @pytest.mark.notest
+# def test_annotate():
+#     # Lines
+#     alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_line_dist_pvalues.csv')
+#     thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/line_organ_p_thresholds.csv')
+#     mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant')
+#
+#     thresholds = pd.read_csv(thresholds_file, index_col=0)
+#     alt = pd.read_csv(alt_file, index_col=0)
+#
+#     run_permutation_stats.annotate(thresholds, alt, mutant_dir)
+#
+#     # # Specimens
+#     alt_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/alt_specimen_dist_pvalues.csv')
+#     thresholds_file = Path('/home/neil/git/lama/tests/test_data/stats_test_data/test_output/organ_vols_permutation/specimen_organ_p_thresholds.csv')
+#     mutant_dir = Path('/home/neil/git/lama/tests/test_data/registration_test_data/mutant')
+#
+#     thresholds = pd.read_csv(thresholds_file, index_col=0)
+#     alt = pd.read_csv(alt_file, index_col=0)
+#
+#     run_permutation_stats.annotate(thresholds, alt, mutant_dir)
+
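The expected values in test_p_thresholds follow from how the null distributions are built: np.linspace(0.01, 1, 100) steps by exactly 0.01, so a threshold of 0.02 admits only two of the 100 simulated null p-values. A quick illustrative check — the actual selection logic lives in p_thresholds.get_thresholds and is not reproduced here:

import numpy as np

null_organ_1 = np.linspace(0.01, 1, 100)  # simulated null p-values, step 0.01
hits = int((null_organ_1 <= 0.02).sum())  # 2: only 0.01 and 0.02 fall at or below the threshold
print(hits, hits / null_organ_1.size)     # -> 2 0.02, i.e. a 2% null positive rate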
diff --git a/lama/tests/test_standard_stats.py b/lama/tests/test_standard_stats.py
index 4825f5ca..ca725869 100644
--- a/lama/tests/test_standard_stats.py
+++ b/lama/tests/test_standard_stats.py
@@ -2,7 +2,6 @@
 Currently just running and making sure there's no uncaught exceptions.
 TODO: check that the correct output is generated too
 
-To run these tests, the test data needs to be fechted from bit/dev/lama_stats_test_data
 In future we should put this in a web-accessible place
 
 Usage:  pytest test_standard_stats.py
@@ -23,11 +22,11 @@
 root_config = dict(
     stats_types=[
-        # 'organ_volumes',
+        'organ_volumes',
         'intensity',
         'jacobians'
     ],
-
+    normalise_organ_vol_to_mask=True,
     reg_folder='similarity',
     jac_folder='similarity',
     mask='mask_tight.nrrd',
@@ -65,7 +64,6 @@
 
     return make_config
 
-
 def test_all(get_config):
     """
     Run the stats module. The data required for this to work must be initially made
diff --git a/lama/utilities/data_clean_up.py b/lama/utilities/data_clean_up.py
new file mode 100644
index 00000000..e7899ea7
--- /dev/null
+++ b/lama/utilities/data_clean_up.py
@@ -0,0 +1,88 @@
+"""
+LAMA produces lots of data. Sometimes we can get rid of much of it afterwards.
+This script removes folders specified in a config file.
+
+This is a work in progress
+
+
+Example yaml config.
+This will delete all folders named 'resolution_images',
+and will delete all contents of 'registrations' except the folder named 'similarity'
+--------------------
+folders_to_rm:
+    resolution_images: []
+    registrations: [similarity]
+
+--------------------
+
+python3 data_clean_up.py config root_dir
+
+This will recursively search directories and delete any folder named in the list
+"""
+
+from pathlib import Path
+import shutil
+import yaml
+from typing import Iterable
+
+
+def is_subseq(x: Iterable, y: Iterable) -> bool:
+    """
+    Check whether x is within y.
+
+    For example
+    registrations/deformable is in output/registrations/deformable/192_12
+
+    """
+    it = iter(y)
+    return all(c in it for c in x)
+
+
+def rm_by_name(root: Path, name: str, to_keep):
+    """
+    Remove directories. If any path or partial path is in to_keep, delete the rest of the folder but keep that one.
+    """
+    dirs = root.glob(f'**/{name}')  # Get all matching directories
+
+    for d in dirs:
+
+        subfolders_to_keep = []
+
+        if not d.is_dir():
+            continue
+
+        for subdir in d.iterdir():
+            if not subdir.is_dir():
+                continue
+
+            for subseq in to_keep:
+
+                if is_subseq(Path(subseq).parts, subdir.parts):
+                    print(f'{subseq} is a subseq of {subdir}')
+                    subfolders_to_keep.append(subdir)
+
+        if not subfolders_to_keep:
+            shutil.rmtree(d)
+        else:
+            for subdir in d.iterdir():
+                if not subdir.is_dir():
+                    continue
+                if subdir not in subfolders_to_keep:
+                    shutil.rmtree(subdir)
+
+
+def run(config_path: str, root_dir: Path):
+
+    with open(config_path) as fh:
+        config = yaml.safe_load(fh)
+    print(f"deleting {config['folders_to_rm']}")
+
+    for dir_, subdirs_to_keep in config['folders_to_rm'].items():
+        rm_by_name(root_dir, dir_, subdirs_to_keep)
+
+
+if __name__ == '__main__':
+    import sys
+    config_path = sys.argv[1]
+    root_dir = sys.argv[2]
+    run(config_path, Path(root_dir))
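To make the keep-logic above concrete: is_subseq checks that the parts of a kept path appear, in order, within a candidate directory's parts. This is what lets a config entry such as registrations: [similarity] preserve .../registrations/similarity while its sibling folders are deleted. An illustration using the function exactly as defined above:

from pathlib import Path

def is_subseq(x, y):  # as in data_clean_up.py above
    it = iter(y)
    return all(c in it for c in x)

kept = Path('registrations/similarity').parts
candidate = Path('output/registrations/similarity/192_12').parts
print(is_subseq(kept, candidate))                                # True: this subfolder is kept
print(is_subseq(Path('registrations/affine').parts, candidate))  # False: would be deleted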
diff --git a/lama/utilities/lama_convert_16_to_8.py b/lama/utilities/lama_convert_16_to_8.py
index 09db0ab1..06bb0004 100755
--- a/lama/utilities/lama_convert_16_to_8.py
+++ b/lama/utilities/lama_convert_16_to_8.py
@@ -2,15 +2,18 @@
 
 import nrrd
 from pathlib import Path
+import itertools
 import numpy as np
 
 from lama import common
 
 
-def convert_16_bit_to_8bit(indir, outdir, clobber: bool):
+def convert_16_bit_to_8bit(indirs, outdir, clobber: bool):
 
-    paths = common.get_file_paths(Path(indir))
+    paths = [common.get_file_paths(Path(x)) for x in indirs]
+    paths = itertools.chain(*paths)
 
     for inpath in paths:
+        print(f'doing {inpath.name}')
         arr, header = nrrd.read(str(inpath))
 
         if arr.dtype not in (np.uint16, np.int16):
@@ -45,9 +48,11 @@ def main():
     import argparse
     parser = argparse.ArgumentParser("Rescale 16 bit images to 8bit")
-    parser.add_argument('-i', dest='indir', help='dir with vols to convert. Will include subdirectories', required=True, nargs='*')
+    parser.add_argument('-i', dest='indirs', help='dirs with vols to convert. Will include subdirectories', required=True, nargs='*')
     parser.add_argument('-o', dest='outdir', help='dir to put vols in. Omit to overwrite source and use --clobber', required=False, default=None)
+    parser.add_argument('--clobber', dest='clobber', help='Overwrite inputs!', action='store_true',
+                        default=None)
 
     args = parser.parse_args()
@@ -58,9 +63,9 @@
     else:
         outdir = None
 
-    indirs = [Path(x) for x in args.indirs]
+    # indirs = [Path(x) for x in args.indirs]
 
-    convert_16_bit_to_8bit(indirs, outdir, args.clobber)
+    convert_16_bit_to_8bit(args.indirs, outdir, args.clobber)
 
 if __name__ == '__main__':
     main()
diff --git a/lama/utilities/lama_pad_volumes.py b/lama/utilities/lama_pad_volumes.py
index 4642f3ae..a9d290e3 100755
--- a/lama/utilities/lama_pad_volumes.py
+++ b/lama/utilities/lama_pad_volumes.py
@@ -28,6 +28,7 @@
 from pathlib import Path
 
 import SimpleITK as sitk
+import nrrd
 from logzero import logger as logging
 
 from lama import common
diff --git a/setup.py b/setup.py
index aab48181..45e907c8 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@ setup(
     name='lama_phenotype_detection',
     download_url='https://github.com/mpi2/lama/archive/0.9.4.tar.gz',
-    version='0.9.50',
+    version='0.9.53',
     packages=find_packages(exclude=("dev")),
     package_data={'': ['current_commit',
                        'stats/rscripts/lmFast.R',
@@ -15,7 +15,7 @@
         'appdirs',
         'matplotlib>=2.2.0',
         'numpy>=1.15.0',
-        'pandas>=0.23.4',
+        'pandas>=1.1.0',
         'scikit-learn>=0.19.2',
         'scipy>=1.1.0',
         'scikit-image>=0.15.0',