Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ You should now be able to run `exonize -h`.


* [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search.
* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.
* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.
* [`SQLite`](https://www.sqlite.org/index.html) \[[download link](https://www.sqlite.org/download.html)\]: for storing the search results.


Expand Down
4 changes: 2 additions & 2 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ Requirements
`exonize` requires a local installation of:

* [`BLAST+`](https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html) \[[download link](https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/)\]: exonize uses the `tblastx` program for conducting the local search.
* [`MUSCLE`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.
* [`SQLite`](https://www.sqlite.org/index.html)[[download link](https://www.sqlite.org/download.html)] : for storing the search results. **_Note:_** If you are a MacOS user, SQLite is included by default.
* [`MUSCLE (v.5.3)`](https://www.drive5.com/muscle/) \[[download link](https://github.com/rcedgar/muscle/releases)\]: used for conducting the global search and correcting the identity of reconciled matches.
* [`SQLite`](https://www.sqlite.org/index.html) \[[download link](https://www.sqlite.org/download.html)\]: for storing the search results. **_Note:_** On macOS, SQLite comes pre-installed.

70 changes: 33 additions & 37 deletions exonize/classifier_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,6 @@ def get_missing_coordinates(
return missing_coordinates[0]
return missing_coordinates if missing_coordinates else ''

@staticmethod
def intersect_tuples(tuples):
"""Return the elements shared by every tuple in ``tuples``.

An empty ``tuples`` argument, or tuples with no element in common,
yields ``()``. The result is built from a set, so the order of the
returned elements is not guaranteed.
"""
if not tuples:
return ()
intersected = set(tuples[0])
for t in tuples[1:]:
# narrow the running set in place to what ``t`` also contains
intersected.intersection_update(t)
return tuple(intersected) if intersected else ()

def get_coding_events_transcript_counts(
self,
gene_id: str,
Expand Down Expand Up @@ -107,11 +98,11 @@ def interdependence_classification(
gene_id: str,
id_: int,
transcript_counts_list: list,
n_coding_events: int
n_coding_events: int,
coding_events_coordinates: list
) -> tuple:
n_mrnas = len(transcript_counts_list)
classification_sums = self._calculate_classification_sums(transcript_counts_list)
intersection = self._find_intersection(transcript_counts_list)

temp = (
gene_id,
Expand All @@ -125,10 +116,14 @@ def interdependence_classification(
)

category, exclusive_events = self._determine_category(
n_mrnas, n_coding_events, classification_sums, intersection, transcript_counts_list
n_mrnas, n_coding_events, classification_sums, transcript_counts_list, coding_events_coordinates
)
exclusive_events_str = ''
if exclusive_events:
temp_list_events = [tuple(event) if len(event) > 1 else list(event).pop() for event in exclusive_events]
exclusive_events_str = '_'.join(map(str, temp_list_events))

return *temp, category, '_'.join(map(str, exclusive_events)) if exclusive_events else ''
return *temp, category, exclusive_events_str

@staticmethod
def _calculate_classification_sums(
Expand All @@ -140,36 +135,33 @@ def _calculate_classification_sums(
for i, category in enumerate(['all', 'present', 'abscent', 'neither'])
}

def _find_intersection(
self,
transcript_counts_list: list
):
"""Find the intersection of missing events.

Takes the last element of every entry in ``transcript_counts_list``,
keeps the truthy ones, and returns what ``intersect_tuples`` computes
for them. Returns ``None`` when no entry has a truthy last element.
"""
missing_events = [
missing_coordinates
for *_, missing_coordinates in transcript_counts_list
if missing_coordinates
]
return self.intersect_tuples(tuples=missing_events) if missing_events else None

@staticmethod
def _find_related_items(item, list_items):
def check_condition(itemi, itemj):
return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {}
Copy link

Copilot AI May 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition function in _find_related_items returns an empty dictionary (`{}`) instead of a Boolean value when the items are equal. Replace `{}` with `False` to ensure a consistent Boolean return type.

Suggested change
return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else {}
return bool(set(itemi).intersection(set(itemj))) if itemi != itemj else False

Copilot uses AI. Check for mistakes.

temp_list = [other_item for other_item in list_items if check_condition(item, other_item)]
return temp_list

def _determine_category(
self,
n_mrnas: int,
n_coding_events: int,
classification_sums: dict,
intersection: tuple,
transcript_counts_list: list
transcript_counts_list: list,
coding_events_coordinates: list
):
"""Determine the category and exclusive events based on classification sums and intersection."""
category = ''
exclusive_events = None
N = n_mrnas * n_coding_events
missing_events = [
missing_coordinates
for *_, missing_coordinates in transcript_counts_list
if missing_coordinates
]

exclusive_candidates = {
frozenset(set(coding_events_coordinates) - set(missing_events))
for *_, missing_events in transcript_counts_list
# we exclude the case where all events are missing
if set(coding_events_coordinates) - set(missing_events) != set(coding_events_coordinates)
}
intersection = [item for item in exclusive_candidates if self._find_related_items(item, exclusive_candidates)]
if classification_sums['all'] == N:
category = 'OBLIGATE'
elif classification_sums['neither'] == N:
Expand All @@ -184,12 +176,14 @@ def _determine_category(
else:
if not intersection:
category = 'OPTIONAL_EXCLUSIVE'
exclusive_events = set(missing_events)
exclusive_events = exclusive_candidates
elif intersection:
category = 'OPTIONAL_FLEXIBLE'
elif not intersection:
category = 'EXCLUSIVE'
exclusive_events = set(missing_events)
exclusive_events = exclusive_candidates
else:
category = '-'

return category, exclusive_events

Expand All @@ -209,7 +203,8 @@ def classify_expansion_interdependence(
gene_id=gene_id,
id_=expansion_id,
transcript_counts_list=transcript_counts_list,
n_coding_events=n_events
n_coding_events=n_events,
coding_events_coordinates=expansion_coding_events_coordinates
)
expansions_classification_tuples.append(classified_expansion)
return expansions_classification_tuples
Expand All @@ -230,6 +225,7 @@ def classify_coding_match_interdependence(
gene_id=gene_id,
id_=match_id,
transcript_counts_list=transcript_counts_list,
n_coding_events=len(match_coding_events_coordinates)
n_coding_events=len(match_coding_events_coordinates),
coding_events_coordinates=match_coding_events_coordinates
)
return classified_match
12 changes: 10 additions & 2 deletions exonize/data_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def sort_list_intervals_dict(
"""
return sorted(
list_dictionaries,
key=lambda x: (x['coordinate'].lower, x['coordinate']),
key=lambda x: (x['coordinate'].lower, x['coordinate'].upper),
reverse=reverse
)

Expand Down Expand Up @@ -90,6 +90,12 @@ def get_interval_length(
return round(intersection_span / longest_length, 3)
return 0.0

@staticmethod
def interval_length(
interval: P.Interval
):
"""Return ``interval.upper - interval.lower + 1`` for the given interval."""
# NOTE(review): the +1 presumably counts both bounds as occupied positions
# (inclusive coordinates) — confirm, since intervals elsewhere in this
# project are constructed with P.open(start, end).
return interval.upper - interval.lower + 1
Copy link

Copilot AI May 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment clarifying that the '+1' adjustment is used because the interval boundaries are inclusive, to document this assumption for future maintainers.

Copilot uses AI. Check for mistakes.

@staticmethod
def reverse_sequence_bool(
gene_strand: str,
Expand Down Expand Up @@ -629,7 +635,9 @@ def construct_peptide_sequences(
mrna_peptide_sequence += cds_peptide_sequence
start_coord = end_coord
frame_cds = frame_next_cds
transcript_dict[P.open(start, end)] = [coord_idx, frame_cds, cds_dna_sequence, cds_peptide_sequence]
transcript_dict[P.open(start, end)] = [
coord_idx, int(frame_cds), cds_dna_sequence, cds_peptide_sequence
]
return mrna_peptide_sequence, transcript_dict

@staticmethod
Expand Down
22 changes: 19 additions & 3 deletions exonize/environment_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import shutil
import sys
import subprocess
from pathlib import Path
from datetime import date

Expand Down Expand Up @@ -158,18 +159,33 @@ def base_settings(

def check_if_tool_installed(
self,
name: str
name: str,
version: str = None,
) -> None:
if shutil.which(name) is None:
self.logger.error(f"Error: {name} is not installed or not in your PATH environment variable.")
self.logger.error(
f"Error: {name} is not installed or not in your PATH environment variable."
)
sys.exit(1)
else:
if version:
program_version = subprocess.run(
[name, '--version'],
capture_output=True,
text=True
)
if version not in program_version.stdout.strip():
Copy link

Copilot AI May 15, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Consider implementing a more robust version comparison rather than a substring check, as this can lead to false positives if version numbers share digits.

Copilot uses AI. Check for mistakes.
self.logger.error(
f"Error: {name} version {version} is not installed, please upgrade/install it."
)
sys.exit(1)

def check_software_requirements(self):
if os.getenv("CI") == "true":
# Skip software checks in CI environment
return
self.check_if_tool_installed(name='sqlite3')
self.check_if_tool_installed(name='muscle')
self.check_if_tool_installed(name='muscle', version='5.3')
if self.SEARCH_ALL:
self.check_if_tool_installed(name='tblastx')

Expand Down
4 changes: 2 additions & 2 deletions exonize/exonize.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def exonize_ascii_art_logo() -> None:

def argument_parser():
parser = argparse.ArgumentParser(
description='Exonize: A tool for discovering exon duplications.'
description='exonize: A tool for discovering exon duplications.'
)
# Required Arguments
parser.add_argument(
Expand Down Expand Up @@ -182,7 +182,7 @@ def argument_parser():
'--csv',
action='store_true',
default=False,
help='If set, Exonize will output a .zip file with a reduced set of the results in CSV format.'
help='If set, exonize will output a .zip file with a reduced set of the results in CSV format.'
)
# Optional Arguments for Numerical Values and Thresholds
parser.add_argument(
Expand Down
2 changes: 1 addition & 1 deletion exonize/reconciler_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -789,7 +789,7 @@ def align_target_coordinates(
)
gene_cds_set = set(
coord for coord, frame in self.data_container.fetch_gene_cdss_set(gene_id=gene_id)
if coord.upper - coord.lower >= self.environment.min_exon_length
if self.data_container.interval_length(coord) >= self.environment.min_exon_length
)
gene_cds_set = set(
self.center_and_sort_cds_coordinates(
Expand Down
6 changes: 3 additions & 3 deletions exonize/searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ def execute_muscle(
):
muscle_command = [
"muscle",
"-in",
"-align",
seq_file_path,
"-out",
"-output",
output_file_path
]
subprocess.run(
Expand Down Expand Up @@ -362,7 +362,7 @@ def fetch_clusters(
return self.data_container.get_overlapping_clusters(
target_coordinates_set=set(
(coordinate, None) for coordinate, frame in cds_coordinates_and_frames
if coordinate.upper - coordinate.lower >= self.environment.min_exon_length),
if self.data_container.interval_length(coordinate) >= self.environment.min_exon_length),
threshold=self.environment.exon_clustering_overlap_threshold
)

Expand Down
6 changes: 4 additions & 2 deletions exonize/sqlite_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,8 @@ def create_expansions_table(
'{self.environment.partial_insertion}',
'{self.environment.partial_excision}',
'{self.environment.intronic}',
'{self.environment.inter_boundary}'
'{self.environment.inter_boundary}',
'-'
)),
EventStart INTEGER NOT NULL,
EventEnd INTEGER NOT NULL,
Expand Down Expand Up @@ -292,7 +293,8 @@ def create_expansions_table(
'FLEXIBLE',
'OPTIONAL_FLEXIBLE',
'OPTIONAL_EXCLUSIVE',
'OPTIONAL_OBLIGATE'
'OPTIONAL_OBLIGATE',
'-'
)),
ExclusiveEvents TEXT,
FOREIGN KEY (GeneID) REFERENCES Genes(GeneID),
Expand Down
57 changes: 27 additions & 30 deletions tests/test_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ def create_exonize_test2():
{'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0},

]
},
Expand All @@ -405,8 +404,7 @@ def create_exonize_test2():
{'coordinate': P.open(2900, 3000), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(3100, 3200), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(3400, 3500), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4000, 4100), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0}

]
},
Expand All @@ -418,7 +416,6 @@ def create_exonize_test2():
{'coordinate': P.open(1400, 1500), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(1650, 1750), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(2700, 2800), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4200, 4300), 'type': 'CDS', 'frame': 0},
{'coordinate': P.open(4500, 4600), 'type': 'CDS', 'frame': 0},
]
}
Expand Down Expand Up @@ -543,34 +540,34 @@ def create_exonize_test2():
return exonize_obj2, results_db_path2


def test_expansion_transcript_iterdependence_classification():
def test_expansion_transcript_interdependence_classification():
_, results_db_path2 = create_exonize_test2()
expected_expansions_classification = [
('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''), # n x (k + 1) = 3 x (1 + 1) = 6
('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''),
('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE',
'_'.join([
str(i)
for i in (
P.open(600, 700),
tuple((P.open(0, 100), P.open(150, 250)))
)
])),
('gene2', 1, 3, 3, 0, 6, 3, 0, 'EXCLUSIVE',
'_'.join([str(i) for i in
(P.open(4500, 4600),
P.open(4200, 4300),
P.open(4000, 4100))
])),
('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''),
('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE',
'_'.join([str(i) for i
in (P.open(2700, 2800),
(P.open(2100, 2200),
P.open(2400, 2500)))
])),
('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''),
]
('gene1', 1, 3, 2, 4, 1, 1, 0, 'FLEXIBLE', ''), # n x (k + 1) = 3 x (1 + 1) = 6
('gene1', 2, 3, 2, 6, 0, 0, 0, 'OBLIGATE', ''),
('gene2', 0, 3, 3, 0, 5, 4, 0, 'EXCLUSIVE',
'_'.join([
str(i)
for i in (
P.open(600, 700),
tuple((P.open(150, 250),P.open(0, 100)))
)
])),
('gene2', 1, 3, 3, 0, 3, 6, 0, 'EXCLUSIVE',
'_'.join([str(i) for i in
(P.open(4200, 4300),
P.open(4500, 4600),
P.open(4000, 4100)
)
])),
('gene2', 2, 3, 3, 3, 1, 2, 3, 'OPTIONAL_FLEXIBLE', ''),
('gene2', 3, 3, 3, 0, 3, 3, 3, 'OPTIONAL_EXCLUSIVE',
'_'.join([str(i) for i
in ((P.open(2400, 2500),
P.open(2100, 2200)), P.open(2700, 2800))
])),
('gene2', 4, 3, 3, 6, 0, 0, 3, 'OPTIONAL_OBLIGATE', ''),
]
with sqlite3.connect(results_db_path2) as db:
cursor = db.cursor()
cursor.execute(
Expand Down