From 6372eadc752b86a1c64cd81d80f16af2b860cf31 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 16:03:35 +0000 Subject: [PATCH 1/2] Initial plan From cae70bada2b9afb7665a4a3fb8c4f14b094280db Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 9 Mar 2026 16:14:08 +0000 Subject: [PATCH 2/2] Improve documentation and code comments for bite calculation Co-authored-by: carolinamonzo <24670602+carolinamonzo@users.noreply.github.com> --- docs/Understanding-the-output-of-SQANTI3-QC.md | 4 ++-- src/classification_steps.py | 9 ++++++++- src/qc_computations.py | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/Understanding-the-output-of-SQANTI3-QC.md b/docs/Understanding-the-output-of-SQANTI3-QC.md index 48bc91fa..b7484d82 100644 --- a/docs/Understanding-the-output-of-SQANTI3-QC.md +++ b/docs/Understanding-the-output-of-SQANTI3-QC.md @@ -153,7 +153,7 @@ The output `_classification.txt` has the following fields: 22. `FL` or `FL.`: FL count associated with this isoform per sample if `--fl_count` is provided, otherwise NA. 23. `n_indels`: total number of indels based on alignment. 24. `n_indels_junc`: number of junctions in this isoform that have alignment indels near the junction site (indicating potentially unreliable junctions). -25. `bite`: TRUE if contains at least one "bite" positive SJ. +25. `bite`: TRUE if any junction in the isoform is "bite" positive (i.e., the novel intron reaches or extends past the nearest annotated splice sites on both ends, extending strictly past on at least one end and thus overlapping an adjacent annotated exon). This is calculated from the `bite_junction` field in the junction output file: if any junction has `bite_junction == TRUE`, the isoform `bite` is TRUE. Isoforms with no junctions (mono-exonic) retain the default value of NA. See also the `bite_junction` field in the junction file glossary below. 26. 
`iso_exp`: short read expression for this isoform if `--expression` is provided, otherwise NA. 27. `gene_exp`: short read expression for the gene associated with this isoform (summing over all isoforms) if `--expression` is provided, otherwise NA. 28. `ratio_exp`: ratio of `iso_exp` to `gene_exp` if `--expression` is provided, otherwise NA. @@ -197,7 +197,7 @@ The `_junctions.txt` file contains the following columns: 10. `end_site_category`: `known` if the junction end site is annotated. If on - strand, this is actually the acceptor site. 11. `diff_to_Ref_start_site`: distance to closest annotated junction start site. If on - strand, this is actually the donor site. 12. `diff_to_Ref_end_site`: distance to closest annotated junction end site. If on - strand, this is actually the acceptor site. -13. `bite_junction`: Applies only to novel splice junctions. If the novel intron partially overlaps annotated exons the bite value is TRUE, otherwise it is FALSE. +13. `bite_junction`: TRUE if the novel junction's intron extends past the annotated splice sites on both ends (i.e., the novel donor is at or upstream of the closest reference donor AND the novel acceptor is at or downstream of the closest reference acceptor, with at least one being strictly past the reference position). This indicates that the novel intron "bites into" the adjacent annotated exons. Calculated from `diff_to_Ref_start_site` ≤ 0 and `diff_to_Ref_end_site` ≤ 0 with at least one being strictly negative. Known junctions (where `junction_category` is `known`) always have `bite_junction = FALSE`. 14. `splice_site`: Splice motif. 15. `RTS_junction`: TRUE if junction is predicted to a template switching artifact. 16. `indel_near_junct`: TRUE if there is alignment indel error near the junction site, indicating potential junction incorrectness. 
diff --git a/src/classification_steps.py b/src/classification_steps.py index 2b116dc7..545fc7c6 100644 --- a/src/classification_steps.py +++ b/src/classification_steps.py @@ -143,8 +143,10 @@ def write_junction_info(trec, junctions_by_chr, accepted_canonical_sites, indelI min_diff_s = min_diff_e = 0 else: # Find the closest junction start site + # min_diff_s = d - closest_donor: negative if query donor is upstream of (inside) the adjacent exon min_diff_s = -find_closest_in_list(junctions_by_chr[trec.chrom]['donors'], d) - # find the closest junction end site + # Find the closest junction end site + # min_diff_e = closest_acceptor - a: negative if query acceptor is downstream of (inside) the adjacent exon min_diff_e = find_closest_in_list(junctions_by_chr[trec.chrom]['acceptors'], a) else: @@ -184,6 +186,11 @@ def write_junction_info(trec, junctions_by_chr, accepted_canonical_sites, indelI "end_site_category": "known" if min_diff_e==0 else "novel", "diff_to_Ref_start_site": min_diff_s if min_diff_s==min_diff_s else "NA", # check if min_diff is actually nan "diff_to_Ref_end_site": min_diff_e if min_diff_e==min_diff_e else "NA", # check if min_diff is actually nan + # min_diff_s = d - closest_donor: negative means query donor is upstream of (inside) the reference exon + # min_diff_e = closest_acceptor - a: negative means query acceptor is downstream of (inside) the reference exon + # bite_junction is TRUE when the novel intron extends past the reference junction on both ends + # (i.e., min_diff_s <= 0 AND min_diff_e <= 0, with at least one being strictly negative), + # meaning the novel intron "bites into" the adjacent annotated exons. 
"bite_junction": "TRUE" if ((min_diff_s<0 or min_diff_e<0) and not(min_diff_s>0 or min_diff_e>0)) else "FALSE", "splice_site": splice_site, "canonical": "canonical" if splice_site in accepted_canonical_sites else "non_canonical", diff --git a/src/qc_computations.py b/src/qc_computations.py index f92ea39c..91e96c86 100644 --- a/src/qc_computations.py +++ b/src/qc_computations.py @@ -141,6 +141,8 @@ def isoforms_junctions(isoforms_info, reader): (r['canonical'] == 'non_canonical'): isoforms_info[r['isoform']].canonical = r['canonical'] + # bite: isoform is TRUE if any junction has bite_junction == TRUE + # Once set to TRUE it stays TRUE; if still 'NA' (first junction), set to whatever bite_junction is if (isoforms_info[r['isoform']].bite == 'NA') or (r['bite_junction'] == 'TRUE'): isoforms_info[r['isoform']].bite = r['bite_junction']