msarrias · msarrias · May 15, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/README.md b/README.md
@@ -47,7 +47,7 @@ You should now be able to run `exonize -h`.
 
 
 * [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search.  
-* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.  
+* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.  
 * [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)]: for storing the search results.
 
 

diff --git a/docs/index.md b/docs/index.md
@@ -42,6 +42,6 @@ Requirements
 `exonize` requires a local installation of:
 
 * [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search.  
-* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.  
-* [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)] : for storing the search results. **_Note:_** If you are a MacOS user, SQLite is included by default.
+* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.  
+* [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)] : for storing the search results. **_Note:_** On macOS, SQLite is pre-installed by default.
 
diff --git a/exonize/classifier_handler.py b/exonize/classifier_handler.py
@@ -53,15 +53,6 @@ def get_missing_coordinates(
             return missing_coordinates[0]
         return missing_coordinates if missing_coordinates else ''
 
-    @staticmethod
-    def intersect_tuples(tuples):
-        if not tuples:
-            return ()
-        intersected = set(tuples[0])
-        for t in tuples[1:]:
-            intersected.intersection_update(t)
-        return tuple(intersected) if intersected else ()
-
     def get_coding_events_transcript_counts(
             self,
             gene_id: str,
@@ -107,11 +98,11 @@ def interdependence_classification(
             gene_id: str,
             id_: int,
             transcript_counts_list: list,
-            n_coding_events: int
+            n_coding_events: int,
+            coding_events_coordinates: list
     ) -> tuple:
         n_mrnas = len(transcript_counts_list)
         classification_sums = self._calculate_classification_sums(transcript_counts_list)
-        intersection = self._find_intersection(transcript_counts_list)
 
         temp = (
             gene_id,
@@ -125,10 +116,14 @@ def interdependence_classification(
         )
 
         category, exclusive_events = self._determine_category(
-            n_mrnas, n_coding_events, classification_sums, intersection, transcript_counts_list
+            n_mrnas, n_coding_events, classification_sums, transcript_counts_list, coding_events_coordinates
         )
+        exclusive_events_str = ''
+        if exclusive_events:
+            temp_list_events = [tuple(event) if len(event) > 1 else list(event).pop() for event in exclusive_events]
+            exclusive_events_str = '_'.join(map(str, temp_list_events))
 
-        return *temp, category, '_'.join(map(str, exclusive_events)) if exclusive_events else ''
+        return *temp, category, exclusive_events_str
 
     @staticmethod
     def _calculate_classification_sums(
@@ -140,36 +135,33 @@ def _calculate_classification_sums(
             for i, category in enumerate(['all', 'present', 'abscent', 'neither'])
         }
 
-    def _find_intersection(
-            self,
-            transcript_counts_list: list
-    ):
-        """Find the intersection of missing events."""
-        missing_events = [
-            missing_coordinates
-            for *_, missing_coordinates in transcript_counts_list
-            if missing_coordinates
-        ]
-        return self.intersect_tuples(tuples=missing_events) if missing_events else None
-
     @staticmethod
+    def _find_related_items(item, list_items):
+        def check_condition(itemi, itemj):
+            return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {}
-            return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {}
+            return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else False
-            return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {}
+            return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else False
+
+        temp_list = [other_item for other_item in list_items if check_condition(item, other_item)]
+        return temp_list
+
     def _determine_category(
+            self,
             n_mrnas: int,
             n_coding_events: int,
             classification_sums: dict,
-            intersection: tuple,
-            transcript_counts_list: list
+            transcript_counts_list: list,
+            coding_events_coordinates: list
     ):
         """Determine the category and exclusive events based on classification sums and intersection."""
         category = ''
         exclusive_events = None
         N = n_mrnas * n_coding_events
-        missing_events = [
-            missing_coordinates
-            for *_, missing_coordinates in transcript_counts_list
-            if missing_coordinates
-        ]
-
+        exclusive_candidates = {
+            frozenset(set(coding_events_coordinates) - set(missing_events))
+            for *_, missing_events in transcript_counts_list
+            # we exclude the case where all events are missing
+            if set(coding_events_coordinates) - set(missing_events) != set(coding_events_coordinates)
+        }
+        intersection = [item for item in exclusive_candidates if self._find_related_items(item, exclusive_candidates)]
         if classification_sums['all'] == N:
             category = 'OBLIGATE'
         elif classification_sums['neither'] == N:
@@ -184,12 +176,14 @@ def _determine_category(
             else:
                 if not intersection:
                     category = 'OPTIONAL_EXCLUSIVE'
-                    exclusive_events = set(missing_events)
+                    exclusive_events = exclusive_candidates
                 elif intersection:
                     category = 'OPTIONAL_FLEXIBLE'
         elif not intersection:
             category = 'EXCLUSIVE'
-            exclusive_events = set(missing_events)
+            exclusive_events = exclusive_candidates
+        else:
+            category = '-'
 
         return category, exclusive_events
 
@@ -209,7 +203,8 @@ def classify_expansion_interdependence(
                     gene_id=gene_id,
                     id_=expansion_id,
                     transcript_counts_list=transcript_counts_list,
-                    n_coding_events=n_events
+                    n_coding_events=n_events,
+                    coding_events_coordinates=expansion_coding_events_coordinates
                 )
                 expansions_classification_tuples.append(classified_expansion)
         return expansions_classification_tuples
@@ -230,6 +225,7 @@ def classify_coding_match_interdependence(
             gene_id=gene_id,
             id_=match_id,
             transcript_counts_list=transcript_counts_list,
-            n_coding_events=len(match_coding_events_coordinates)
+            n_coding_events=len(match_coding_events_coordinates),
+            coding_events_coordinates=match_coding_events_coordinates
         )
         return classified_match
diff --git a/exonize/data_preprocessor.py b/exonize/data_preprocessor.py
@@ -62,7 +62,7 @@ def sort_list_intervals_dict(
         """
         return sorted(
             list_dictionaries,
-            key=lambda x: (x['coordinate'].lower, x['coordinate']),
+            key=lambda x: (x['coordinate'].lower, x['coordinate'].upper),
             reverse=reverse
         )
 
@@ -90,6 +90,12 @@ def get_interval_length(
             return round(intersection_span / longest_length, 3)
         return 0.0
 
+    @staticmethod
+    def interval_length(
+            interval: P.Interval
+    ):
+        return interval.upper - interval.lower + 1
+
     @staticmethod
     def reverse_sequence_bool(
             gene_strand: str,
@@ -629,7 +635,9 @@ def construct_peptide_sequences(
             mrna_peptide_sequence += cds_peptide_sequence
             start_coord = end_coord
             frame_cds = frame_next_cds
-            transcript_dict[P.open(start, end)] = [coord_idx, frame_cds, cds_dna_sequence, cds_peptide_sequence]
+            transcript_dict[P.open(start, end)] = [
+                coord_idx, int(frame_cds), cds_dna_sequence, cds_peptide_sequence
+            ]
         return mrna_peptide_sequence, transcript_dict
 
     @staticmethod

diff --git a/exonize/environment_setup.py b/exonize/environment_setup.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 import sys
+import subprocess
 from pathlib import Path
 from datetime import date
 
@@ -158,18 +159,33 @@ def base_settings(
 
     def check_if_tool_installed(
             self,
-            name: str
+            name: str,
+            version: str = None,
     ) -> None:
         if shutil.which(name) is None:
-            self.logger.error(f"Error: {name} is not installed or not in your PATH environment variable.")
+            self.logger.error(
+                f"Error: {name} is not installed or not in your PATH environment variable."
+            )
             sys.exit(1)
+        else:
+            if version:
+                program_version = subprocess.run(
+                    [name, '--version'],
+                    capture_output=True,
+                    text=True
+                )
+                if version not in program_version.stdout.strip():
+                    self.logger.error(
+                        f"Error: {name} version {version} is not installed, please upgrade/install it."
+                    )
+                    sys.exit(1)
 
     def check_software_requirements(self):
         if os.getenv("CI") == "true":
             # Skip software checks in CI environment
             return
         self.check_if_tool_installed(name='sqlite3')
-        self.check_if_tool_installed(name='muscle')
+        self.check_if_tool_installed(name='muscle', version='5.3')
         if self.SEARCH_ALL:
             self.check_if_tool_installed(name='tblastx')
 

diff --git a/exonize/exonize.py b/exonize/exonize.py
@@ -32,7 +32,7 @@ def exonize_ascii_art_logo() -> None:
 
 def argument_parser():
     parser = argparse.ArgumentParser(
-        description='Exonize: A tool for discovering exon duplications.'
+        description='exonize: A tool for discovering exon duplications.'
     )
     # Required Arguments
     parser.add_argument(
@@ -182,7 +182,7 @@ def argument_parser():
         '--csv',
         action='store_true',
         default=False,
-        help='If set, Exonize will output a .zip file with a reduced set of the results in CSV format.'
+        help='If set, exonize will output a .zip file with a reduced set of the results in CSV format.'
     )
     # Optional Arguments for Numerical Values and Thresholds
     parser.add_argument(

diff --git a/exonize/reconciler_handler.py b/exonize/reconciler_handler.py
@@ -789,7 +789,7 @@ def align_target_coordinates(
         )
         gene_cds_set = set(
             coord for coord, frame in self.data_container.fetch_gene_cdss_set(gene_id=gene_id)
-            if coord.upper - coord.lower >= self.environment.min_exon_length
+            if self.data_container.interval_length(coord) >= self.environment.min_exon_length
         )
         gene_cds_set = set(
             self.center_and_sort_cds_coordinates(

diff --git a/exonize/searcher.py b/exonize/searcher.py
@@ -106,9 +106,9 @@ def execute_muscle(
     ):
         muscle_command = [
             "muscle",
-            "-in",
+            "-align",
             seq_file_path,
-            "-out",
+            "-output",
             output_file_path
         ]
         subprocess.run(
@@ -362,7 +362,7 @@ def fetch_clusters(
         return self.data_container.get_overlapping_clusters(
             target_coordinates_set=set(
                 (coordinate, None) for coordinate, frame in cds_coordinates_and_frames
-                if coordinate.upper - coordinate.lower >= self.environment.min_exon_length),
+                if self.data_container.interval_length(coordinate) >= self.environment.min_exon_length),
             threshold=self.environment.exon_clustering_overlap_threshold
         )
 

diff --git a/exonize/sqlite_handler.py b/exonize/sqlite_handler.py
@@ -260,7 +260,8 @@ def create_expansions_table(
                                         '{self.environment.partial_insertion}',
                                         '{self.environment.partial_excision}',
                                         '{self.environment.intronic}',
-                                        '{self.environment.inter_boundary}'
+                                        '{self.environment.inter_boundary}',
+                                        '-'
                  )),
                 EventStart INTEGER NOT NULL,
                 EventEnd INTEGER NOT NULL,
@@ -292,7 +293,8 @@ def create_expansions_table(
                                         'FLEXIBLE',
                                         'OPTIONAL_FLEXIBLE',
                                         'OPTIONAL_EXCLUSIVE',
-                                        'OPTIONAL_OBLIGATE'
+                                        'OPTIONAL_OBLIGATE',
+                                        '-'
                  )),
                 ExclusiveEvents TEXT,
                 FOREIGN KEY (GeneID) REFERENCES Genes(GeneID),

diff --git a/tests/test_classification.py b/tests/test_classification.py
@@ -394,7 +394,6 @@ def create_exonize_test2():
                         {'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0},
-                        {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0},
 
                     ]
                 },
@@ -405,8 +404,7 @@ def create_exonize_test2():
                         {'coordinate': P.open(2900, 3000), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0},
-                        {'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0},
-                        {'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0},
+                        {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0}
 
                     ]
                 },
@@ -418,7 +416,6 @@ def create_exonize_test2():
                         {'coordinate': P.open(1400, 1500), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(1650, 1750), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(2700, 2800), 'type': 'CDS', 'frame': 0},
-                        {'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0},
                         {'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0},
                     ]
                 }
@@ -543,34 +540,34 @@ def create_exonize_test2():
     return exonize_obj2, results_db_path2
 
 
-def test_expansion_transcript_iterdependence_classification():
+def test_expansion_transcript_interdependence_classification():
     _, results_db_path2 = create_exonize_test2()
     expected_expansions_classification = [
-        ('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''),  # n x (k + 1) = 3 x (1 + 1) = 6
-        ('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''),
-        ('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE',
-         '_'.join([
-             str(i)
-             for i in (
-                 P.open(600, 700),
-                 tuple((P.open(0, 100), P.open(150, 250)))
-             )
-         ])),
-        ('gene2', 1, 3, 3, 0, 6, 3, 0, 'EXCLUSIVE',
-         '_'.join([str(i) for i in
-                   (P.open(4500, 4600),
-                    P.open(4200, 4300),
-                    P.open(4000, 4100))
-                   ])),
-        ('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''),
-        ('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE',
-         '_'.join([str(i) for i
-                   in (P.open(2700, 2800),
-                       (P.open(2100, 2200),
-                        P.open(2400, 2500)))
-                   ])),
-        ('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''),
-    ]
+    ('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''),  # n x (k + 1) = 3 x (1 + 1) = 6
+    ('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''),
+    ('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE',
+     '_'.join([
+         str(i)
+         for i in (
+             P.open(600, 700),
+             tuple((P.open(150, 250),P.open(0, 100)))
+         )
+     ])),
+    ('gene2', 1, 3, 3, 0, 3, 6, 0, 'EXCLUSIVE',
+     '_'.join([str(i) for i in
+               (P.open(4200, 4300),
+                P.open(4500, 4600),
+                P.open(4000, 4100)
+               )
+               ])),
+    ('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''),
+    ('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE',
+     '_'.join([str(i) for i
+               in ((P.open(2400, 2500),
+                    P.open(2100, 2200)), P.open(2700, 2800))
+               ])),
+    ('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''),
+]
     with sqlite3.connect(results_db_path2) as db:
         cursor = db.cursor()
         cursor.execute(