diff --git a/README.md b/README.md index a02e116..e9fe113 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ You should now be able to run `exonize -h`. * [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search. -* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches. +* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches. * [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)]: for storing the search results. diff --git a/docs/index.md b/docs/index.md index c4c0786..c30f3c8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -42,6 +42,6 @@ Requirements `exonize` requires a local installation of: * [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search. -* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches. -* [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)] : for storing the search results. **_Note:_** If you are a MacOS user, SQLite is included by default. +* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches. +* [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)] : for storing the search results. **_Note:_** On macOS, SQLite is pre-installed by default. diff --git a/exonize/classifier_handler.py b/exonize/classifier_handler.py index 8f9d411..890689c 100644 --- a/exonize/classifier_handler.py +++ b/exonize/classifier_handler.py @@ -53,15 +53,6 @@ def get_missing_coordinates( return missing_coordinates[0] return missing_coordinates if missing_coordinates else '' - @staticmethod - def intersect_tuples(tuples): - if not tuples: - return () - intersected = set(tuples[0]) - for t in tuples[1:]: - intersected.intersection_update(t) - return tuple(intersected) if intersected else () - def get_coding_events_transcript_counts( self, gene_id: str, @@ -107,11 +98,11 @@ def interdependence_classification( gene_id: str, id_: int, transcript_counts_list: list, - n_coding_events: int + n_coding_events: int, + coding_events_coordinates: list ) -> tuple: n_mrnas = len(transcript_counts_list) classification_sums = self._calculate_classification_sums(transcript_counts_list) - intersection = self._find_intersection(transcript_counts_list) temp = ( gene_id, @@ -125,10 +116,14 @@ def interdependence_classification( ) category, exclusive_events = self._determine_category( - n_mrnas, n_coding_events, classification_sums, intersection, transcript_counts_list + n_mrnas, n_coding_events, classification_sums, transcript_counts_list, coding_events_coordinates ) + exclusive_events_str = '' + if exclusive_events: + temp_list_events = [tuple(event) if len(event) > 1 else list(event).pop() for event in exclusive_events] + exclusive_events_str = '_'.join(map(str, temp_list_events)) - return *temp, category, '_'.join(map(str, exclusive_events)) if exclusive_events else '' + return *temp, category, exclusive_events_str @staticmethod def _calculate_classification_sums( @@ -140,36 +135,33 @@ def _calculate_classification_sums( for i, category in enumerate(['all', 'present', 'abscent', 'neither']) } - def _find_intersection( - self, - transcript_counts_list: list - ): - """Find the intersection of missing events.""" - missing_events = [ - missing_coordinates - for *_, missing_coordinates in transcript_counts_list - if missing_coordinates - ] - return self.intersect_tuples(tuples=missing_events) if missing_events else None - @staticmethod + def _find_related_items(item, list_items): + def check_condition(itemi, itemj): + return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {} + + temp_list = [other_item for other_item in list_items if check_condition(item, other_item)] + return temp_list + def _determine_category( + self, n_mrnas: int, n_coding_events: int, classification_sums: dict, - intersection: tuple, - transcript_counts_list: list + transcript_counts_list: list, + coding_events_coordinates: list ): """Determine the category and exclusive events based on classification sums and intersection.""" category = '' exclusive_events = None N = n_mrnas * n_coding_events - missing_events = [ - missing_coordinates - for *_, missing_coordinates in transcript_counts_list - if missing_coordinates - ] - + exclusive_candidates = { + frozenset(set(coding_events_coordinates) - set(missing_events)) + for *_, missing_events in transcript_counts_list + # we exclude the case where all events are missing + if set(coding_events_coordinates) - set(missing_events) != set(coding_events_coordinates) + } + intersection = [item for item in exclusive_candidates if self._find_related_items(item, exclusive_candidates)] if classification_sums['all'] == N: category = 'OBLIGATE' elif classification_sums['neither'] == N: @@ -184,12 +176,14 @@ def _determine_category( else: if not intersection: category = 'OPTIONAL_EXCLUSIVE' - exclusive_events = set(missing_events) + exclusive_events = exclusive_candidates elif intersection: category = 'OPTIONAL_FLEXIBLE' elif not intersection: category = 'EXCLUSIVE' - exclusive_events = set(missing_events) + exclusive_events = exclusive_candidates + else: + category = '-' return category, exclusive_events @@ -209,7 +203,8 @@ def classify_expansion_interdependence( gene_id=gene_id, id_=expansion_id, transcript_counts_list=transcript_counts_list, - n_coding_events=n_events + n_coding_events=n_events, + coding_events_coordinates=expansion_coding_events_coordinates ) expansions_classification_tuples.append(classified_expansion) return expansions_classification_tuples @@ -230,6 +225,7 @@ def classify_coding_match_interdependence( gene_id=gene_id, id_=match_id, transcript_counts_list=transcript_counts_list, - n_coding_events=len(match_coding_events_coordinates) + n_coding_events=len(match_coding_events_coordinates), + coding_events_coordinates=match_coding_events_coordinates ) return classified_match diff --git a/exonize/data_preprocessor.py b/exonize/data_preprocessor.py index dae8759..6ed0946 100644 --- a/exonize/data_preprocessor.py +++ b/exonize/data_preprocessor.py @@ -62,7 +62,7 @@ def sort_list_intervals_dict( """ return sorted( list_dictionaries, - key=lambda x: (x['coordinate'].lower, x['coordinate']), + key=lambda x: (x['coordinate'].lower, x['coordinate'].upper), reverse=reverse ) @@ -90,6 +90,12 @@ def get_interval_length( return round(intersection_span / longest_length, 3) return 0.0 + @staticmethod + def interval_length( + interval: P.Interval + ): + return interval.upper - interval.lower + 1 + @staticmethod def reverse_sequence_bool( gene_strand: str, @@ -629,7 +635,9 @@ def construct_peptide_sequences( mrna_peptide_sequence += cds_peptide_sequence start_coord = end_coord frame_cds = frame_next_cds - transcript_dict[P.open(start, end)] = [coord_idx, frame_cds, cds_dna_sequence, cds_peptide_sequence] + transcript_dict[P.open(start, end)] = [ + coord_idx, int(frame_cds), cds_dna_sequence, cds_peptide_sequence + ] return mrna_peptide_sequence, transcript_dict @staticmethod diff --git a/exonize/environment_setup.py b/exonize/environment_setup.py index 17e128c..4e9d7b3 100644 --- a/exonize/environment_setup.py +++ b/exonize/environment_setup.py @@ -2,6 +2,7 @@ import os import shutil import sys +import subprocess from pathlib import Path from datetime import date @@ -158,18 +159,33 @@ def base_settings( def check_if_tool_installed( self, - name: str + name: str, + version: str = None, ) -> None: if shutil.which(name) is None: - self.logger.error(f"Error: {name} is not installed or not in your PATH environment variable.") + self.logger.error( + f"Error: {name} is not installed or not in your PATH environment variable." + ) sys.exit(1) + else: + if version: + program_version = subprocess.run( + [name, '--version'], + capture_output=True, + text=True + ) + if version not in program_version.stdout.strip(): + self.logger.error( + f"Error: {name} version {version} is not installed, please upgrade/install it." + ) + sys.exit(1) def check_software_requirements(self): if os.getenv("CI") == "true": # Skip software checks in CI environment return self.check_if_tool_installed(name='sqlite3') - self.check_if_tool_installed(name='muscle') + self.check_if_tool_installed(name='muscle', version='5.3') if self.SEARCH_ALL: self.check_if_tool_installed(name='tblastx') diff --git a/exonize/exonize.py b/exonize/exonize.py index 3188f48..26ea634 100644 --- a/exonize/exonize.py +++ b/exonize/exonize.py @@ -32,7 +32,7 @@ def exonize_ascii_art_logo() -> None: def argument_parser(): parser = argparse.ArgumentParser( - description='Exonize: A tool for discovering exon duplications.' + description='exonize: A tool for discovering exon duplications.' ) # Required Arguments parser.add_argument( @@ -182,7 +182,7 @@ def argument_parser(): '--csv', action='store_true', default=False, - help='If set, Exonize will output a .zip file with a reduced set of the results in CSV format.' + help='If set, exonize will output a .zip file with a reduced set of the results in CSV format.' ) # Optional Arguments for Numerical Values and Thresholds parser.add_argument( diff --git a/exonize/reconciler_handler.py b/exonize/reconciler_handler.py index ffc6603..86f1820 100644 --- a/exonize/reconciler_handler.py +++ b/exonize/reconciler_handler.py @@ -789,7 +789,7 @@ def align_target_coordinates( ) gene_cds_set = set( coord for coord, frame in self.data_container.fetch_gene_cdss_set(gene_id=gene_id) - if coord.upper - coord.lower >= self.environment.min_exon_length + if self.data_container.interval_length(coord) >= self.environment.min_exon_length ) gene_cds_set = set( self.center_and_sort_cds_coordinates( diff --git a/exonize/searcher.py b/exonize/searcher.py index 803dcb2..60adfa2 100644 --- a/exonize/searcher.py +++ b/exonize/searcher.py @@ -106,9 +106,9 @@ def execute_muscle( ): muscle_command = [ "muscle", - "-in", + "-align", seq_file_path, - "-out", + "-output", output_file_path ] subprocess.run( @@ -362,7 +362,7 @@ def fetch_clusters( return self.data_container.get_overlapping_clusters( target_coordinates_set=set( (coordinate, None) for coordinate, frame in cds_coordinates_and_frames - if coordinate.upper - coordinate.lower >= self.environment.min_exon_length), + if self.data_container.interval_length(coordinate) >= self.environment.min_exon_length), threshold=self.environment.exon_clustering_overlap_threshold ) diff --git a/exonize/sqlite_handler.py b/exonize/sqlite_handler.py index 4825bb6..5d0048f 100644 --- a/exonize/sqlite_handler.py +++ b/exonize/sqlite_handler.py @@ -260,7 +260,8 @@ def create_expansions_table( '{self.environment.partial_insertion}', '{self.environment.partial_excision}', '{self.environment.intronic}', - '{self.environment.inter_boundary}' + '{self.environment.inter_boundary}', + '-' )), EventStart INTEGER NOT NULL, EventEnd INTEGER NOT NULL, @@ -292,7 +293,8 @@ def create_expansions_table( 'FLEXIBLE', 'OPTIONAL_FLEXIBLE', 'OPTIONAL_EXCLUSIVE', - 'OPTIONAL_OBLIGATE' + 'OPTIONAL_OBLIGATE', + '-' )), ExclusiveEvents TEXT, FOREIGN KEY (GeneID) REFERENCES Genes(GeneID), diff --git a/tests/test_classification.py b/tests/test_classification.py index 098a9f6..4a29631 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -394,7 +394,6 @@ def create_exonize_test2(): {'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0}, - {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0}, ] }, @@ -405,8 +404,7 @@ def create_exonize_test2(): {'coordinate': P.open(2900, 3000), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0}, - {'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0}, - {'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0}, + {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0} ] }, @@ -418,7 +416,6 @@ def create_exonize_test2(): {'coordinate': P.open(1400, 1500), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(1650, 1750), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(2700, 2800), 'type': 'CDS', 'frame': 0}, - {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0}, {'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0}, ] } @@ -543,34 +540,34 @@ def create_exonize_test2(): return exonize_obj2, results_db_path2 -def test_expansion_transcript_iterdependence_classification(): +def test_expansion_transcript_interdependence_classification(): _, results_db_path2 = create_exonize_test2() expected_expansions_classification = [ - ('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''), # n x (k + 1) = 3 x (1 + 1) = 6 - ('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''), - ('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE', - '_'.join([ - str(i) - for i in ( - P.open(600, 700), - tuple((P.open(0, 100), P.open(150, 250))) - ) - ])), - ('gene2', 1, 3, 3, 0, 6, 3, 0, 'EXCLUSIVE', - '_'.join([str(i) for i in - (P.open(4500, 4600), - P.open(4200, 4300), - P.open(4000, 4100)) - ])), - ('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''), - ('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE', - '_'.join([str(i) for i - in (P.open(2700, 2800), - (P.open(2100, 2200), - P.open(2400, 2500))) - ])), - ('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''), - ] + ('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''), # n x (k + 1) = 3 x (1 + 1) = 6 + ('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''), + ('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE', + '_'.join([ + str(i) + for i in ( + P.open(600, 700), + tuple((P.open(150, 250),P.open(0, 100))) + ) + ])), + ('gene2', 1, 3, 3, 0, 3, 6, 0, 'EXCLUSIVE', + '_'.join([str(i) for i in + (P.open(4200, 4300), + P.open(4500, 4600), + P.open(4000, 4100) + ) + ])), + ('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''), + ('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE', + '_'.join([str(i) for i + in ((P.open(2400, 2500), + P.open(2100, 2200)), P.open(2700, 2800)) + ])), + ('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''), +] with sqlite3.connect(results_db_path2) as db: cursor = db.cursor() cursor.execute(