From d377629065d32541ca0f6f6fa8b6e722cc52748c Mon Sep 17 00:00:00 2001 From: rraadd88 Date: Wed, 25 Feb 2026 21:41:21 +0000 Subject: [PATCH] + CLI access via run.py to run the full full workflow --- generate_const_intronsExons_gtf.R | 17 ++- run.py | 243 ++++++++++++++++++++++++++++++ 2 files changed, 257 insertions(+), 3 deletions(-) create mode 100644 run.py diff --git a/generate_const_intronsExons_gtf.R b/generate_const_intronsExons_gtf.R index 76763b5..8019450 100644 --- a/generate_const_intronsExons_gtf.R +++ b/generate_const_intronsExons_gtf.R @@ -108,12 +108,23 @@ out$gene_id <- paste(out$gene_id, out$gene_name, sep = '_') seqlevelsStyle(out) <- args[2] -fpath <- head(strsplit(args[1], '/')[[1]], -1) %>% paste(., collapse = '/') -fname <- paste0(tail(strsplit(args[1], '/')[[1]], 1), '_', Sys.Date(), '_constExonsAndIntrons_', args[2], '.gtf') +# If the third argument ends with .gtf, use as output path, else construct filename as before +if (grepl("\\.gtf$", args[3])) { + out_path <- args[3] +} else { + fpath <- args[3] + fname <- paste0( + tail(strsplit(args[1], '/')[[1]], 1), + '_', Sys.Date(), + '_constExonsAndIntrons_', + args[2], '.gtf' + ) + out_path <- file.path(fpath, fname) +} export( object = out[ , -2], # omit gene name - con = file.path(args[3], fname) + con = out_path ) w <- warnings() diff --git a/run.py b/run.py new file mode 100644 index 0000000..708abfc --- /dev/null +++ b/run.py @@ -0,0 +1,243 @@ +# # CRIES runner +# ## Counting Reads for Intronic and Exonic Segments + +# Requirements +# micromamba env create -n cries r-base=4.2.0 r-data.table r-magrittr bioconda::bioconductor-rtracklayer bioconda::hisat2 +# pip install RSeQC +# micromamba install -c bioconda bedops gffread htseq +# conda activate cries + +## param.s +input_path=None #';'.join([f'/path/to/{k}.fastq.gz' for k in [id1,id2]]) # fastqs +output_path=None #'path/to.json' + +ensembl_release=112 +method_count='featureCounts' +temp_dir_path='~/scratch' +cpus=10 +kws={} + +## for CLI +import argparse +# Create the parser +parser = argparse.ArgumentParser(description="A script with --pms parameter") + +# Add --pms argument +parser.add_argument('--input-path', type=str, required=True, help="in path") +parser.add_argument('--output-path', type=str, required=True, help="out path") +parser.add_argument('--kws', type=str, required=False, default=kws, help="kws") + +# Parse the arguments +args = parser.parse_args() + +input_path=args.input_path +output_path=args.output_path + +## packages +import os +from pathlib import Path +from glob import glob + +## derived paths +fastq_path=input_path +del input_path + +temp_dir_path=Path(temp_dir_path).expanduser().as_posix() +output_path=Path(output_path).expanduser().as_posix() + +output_dir_path=Path(output_path).with_suffix('').as_posix() +Path(output_dir_path).mkdir(parents=True,exist_ok=True) + +annots_gtf_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{ensembl_release}.gtf" +fa_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa" + +annots_bed_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{ensembl_release}_for_gtf2bed.bed" + +annots_flt_path=f"{Path(annots_gtf_path).with_suffix('')}_constExonsAndIntrons_ensembl.gtf" +hisat2_index_path=f"{Path(fa_path).with_suffix('')}_hisat2_index" + +if not Path(annots_flt_path).exists(): + # 2. Download GTF annotation file if not present + if not Path(annots_gtf_path).exists(): + print("Downloading annots_gtf_path...") + os.system(f"wget -nc -x {annots_gtf_path.replace(temp_dir_path,'https:/')}.gz -P {temp_dir_path};[ ! -f {annots_gtf_path} ] && gunzip {annots_gtf_path}.gz") + assert Path(annots_gtf_path).exists(), annots_gtf_path + + # 4. Run the R script + com=f"Rscript generate_const_intronsExons_gtf.R {annots_gtf_path} ensembl {annots_flt_path}" + print(com) + os.system(com) +assert Path(annots_flt_path).exists(), annots_flt_path + +if not Path(annots_bed_path).exists(): + ## gtf2bed compatible gtf + com=f"gffread {annots_gtf_path} -T -o {Path(annots_bed_path).with_suffix('.gtf')}" + print(com) + os.system(com) + com=f"gtf2bed < {Path(annots_bed_path).with_suffix('.gtf')} > {annots_bed_path}" + print(com) + os.system(com) +assert Path(annots_bed_path).exists(), annots_bed_path + +if len(glob(f"{hisat2_index_path}*.ht2"))==0: + if not Path(fa_path).exists(): + com=f"wget -nc -x {fa_path.replace(temp_dir_path,'https:/')}.gz -P {temp_dir_path};[ ! -f {fa_path} ] && gunzip {fa_path}.gz" + print(com) + os.system(com) + assert Path(fa_path).exists(), fa_path + + com=f"hisat2-build -p {cpus} {fa_path} {hisat2_index_path}" + print(com) + os.system(com) + print(glob(f"{hisat2_index_path}*.ht2")) +assert len(glob(f"{hisat2_index_path}*.ht2"))>0, f"{hisat2_index_path}*.ht2" +# ## Step 2. Mapping reads +# We use HISAT2 for mapping RNA-seq reads, keeping only alignments with MAPQ score ≥30. + +# filters out unmapped reads, secondary alignments, failed QC reads, and duplicates. +paired=';' in fastq_path +if paired: + fastq_r1_path,fastq_r2_path=fastq_path.split(';') + del fastq_path + com_arg=f" -1 {fastq_r1_path} -2 {fastq_r2_path} " +else: + com_arg=f"-U {fastq_path}" + +# bam_path=Path(output_path).with_suffix('.sorted.flt.bam').as_posix() +bam_path=f"{output_dir_path}/possorted.flt.bam" + +if not Path(bam_path).exists(): + Path(f"{temp_dir_path}/tmp").mkdir(parents=True,exist_ok=True) + + com=f"hisat2 -t -p {cpus} -x {hisat2_index_path} {com_arg} --summary-file {bam_path}.log | samtools sort -T {temp_dir_path}/tmp - | samtools view -b -F 1548 -q 30 -o {bam_path} -" + print(com) + os.system(com) +assert Path(bam_path).exists(), bam_path + +# # ## Step 3. Counting reads that map to intronic or exonic segments of each gene + +## inferring the strandedness +# The htseq-count parameter `` depends on the strandedness of the library preparation kit. For Illumina TruSeq RNA Library Prep Kit, the correct parameter is often `reverse`. If not sure, try all the three different options `no`, `yes` and `reverse`, and decide accordingly. + +import re +import sys + +def get_htseq_strand_argument( + infer_output_text: str, + stranded_threshold: float = 0.75 + ) -> str: + """ + Parses the text output of RSeQC's infer_experiment.py to determine the + correct strand argument for htseq-count. + + Args: + infer_output_text: The full string output from infer_experiment.py. + stranded_threshold: The fraction required to consider the library stranded. + Defaults to 0.75. + + Returns: + A string, either 'reverse', 'yes', or 'no'. + """ + if not '\n' in infer_output_text: + # path + infer_output_text=open(infer_output_text,'r').read() + + forward_fraction = 0.0 + reverse_fraction = 0.0 + + # Pattern to find the fraction for stranded-forward reads ("1++,1--,2+-,2-+") + forward_match = re.search(r"explained by \"1\+\+,1--,2\+-,2-\+\": ([\d\.]+)", infer_output_text) + if forward_match: + forward_fraction = float(forward_match.group(1)) + + # Pattern to find the fraction for stranded-reverse reads ("1+-,1-+,2++,2--") + reverse_match = re.search(r"explained by \"1\+-,1-\+,2\+\+,2--\": ([\d\.]+)", infer_output_text) + if reverse_match: + reverse_fraction = float(reverse_match.group(1)) + + # Handle cases where the output format might be unexpected + if not forward_match and not reverse_match: + # print("Warning: Could not parse RSeQC output. Defaulting to 'no'.", file=sys.stderr) + return 'no' + + print(f"for:{forward_fraction},rev:{reverse_fraction}") + # Determine strandedness based on the fractions + if reverse_fraction > stranded_threshold: + return 'reverse' + elif forward_fraction > stranded_threshold: + return 'yes' + else: + return 'no' + +# # Example 1: Stranded Reverse library +# output_reverse = """ +# This is PairEnd Data +# Fraction of reads failed to determine: 0.0050 +# Fraction of reads explained by "1++,1--,2+-,2-+": 0.0080 +# Fraction of reads explained by "1+-,1-+,2++,2--": 0.9870 +# """ + +# # Example 2: Stranded Forward library +# output_forward = """ +# This is PairEnd Data +# Fraction of reads failed to determine: 0.0045 +# Fraction of reads explained by "1++,1--,2+-,2-+": 0.9915 +# Fraction of reads explained by "1+-,1-+,2++,2--": 0.0040 +# """ + +# # Example 3: Unstranded library +# output_unstranded = """ +# This is PairEnd Data +# Fraction of reads failed to determine: 0.0100 +# Fraction of reads explained by "1++,1--,2+-,2-+": 0.4900 +# Fraction of reads explained by "1+-,1-+,2++,2--": 0.5000 +# """ + +# # Get the argument for each example +# strand_arg_1 = get_htseq_strand_argument(output_reverse) +# strand_arg_2 = get_htseq_strand_argument(output_forward) +# strand_arg_3 = get_htseq_strand_argument(output_unstranded) + +strand_path=Path(bam_path).with_suffix('.strandedness').as_posix() +if not Path(strand_path).exists(): + com=f"infer_experiment.py -r {annots_bed_path} -i {bam_path} > {strand_path}" + print(com) + os.system(com) +strand=get_htseq_strand_argument(strand_path) +print(strand) + +# We use HTSeq-count for counting reads. For counting exonic reads, we run the HTSeq-count using the "intersection-strict" mode, to ensure that the reads that are counted entirely fit within the mature mRNA sequence, and do not overlap alternatively spliced exons. For intronic reads, we run the HTSeq-count using the "union" mode, since any read that even partially overlaps an intronic region likely originates from pre-mature RNA. + +for feat_type in ['exon','intron']: + outp=Path(bam_path).with_suffix(f'.{feat_type}_counts.txt') + + if not Path(outp).exists(): + + if method_count=='htseq': + com=f"htseq-count --type {feat_type} -m {'intersection-strict' if feat_type=='exon' else 'union'} -f bam -r pos --stranded {strand} {bam_path} {annots_flt_path} > {outp}" + # note: for PE data, divide the counts in the output file by 2. + else: + strand_value={'no':0, 'yes':1, 'reverse':2}[strand] + com=f"featureCounts {'-p' if paired else ''} -t {feat_type} {'--fracOverlap 1.0' if feat_type=='exon' else ''} -g gene_id -s {strand_value} -a {annots_flt_path} -o {outp} -T {cpus} {bam_path} " + print(com) + os.system(com) + assert Path(outp).exists(), outp + +# ## Step 4. Normalization + +# We generally use our package [REMBRANDTS](https://github.com/csglab/REMBRANDTS) for read count normalization, gene filtering, and removing bias in order to obtain stability measurements. REMBRANDTS internally uses DESeq, and performs linear regression of Δexon–Δintron vs. Δintron to obtain residuals that correspond to unbiased estimates of stability. + +# In case you do not want to use REMBRANDTS, you can use variance-stabilized transformation of DESeq or DESeq2 for read count normalization (for each set of intronic and exonic reads separately), and then take Δexon–Δintron between any two samples as the measure of differential stability. However, note that this measure can be biased, overestimating the stability of transcriptionally down-regulated genes and underestimating the stability of transcriptionally up-regulated genes, as discussed in this paper: [Alkallas et al., Nat Commun, 8:909](https://www.nature.com/articles/s41467-017-00867-z). + +## Outputs +### paths of the outputs +import json +with open(output_path, 'w') as file: + # indent=4 makes the output readable, similar to YAML + json.dump( + dict( + bam_path=bam_path, + ), + file, + indent=4 + )