Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions generate_const_intronsExons_gtf.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,23 @@ out$gene_id <- paste(out$gene_id, out$gene_name, sep = '_')

seqlevelsStyle(out) <- args[2]

fpath <- head(strsplit(args[1], '/')[[1]], -1) %>% paste(., collapse = '/')
fname <- paste0(tail(strsplit(args[1], '/')[[1]], 1), '_', Sys.Date(), '_constExonsAndIntrons_', args[2], '.gtf')
# If the third argument ends with .gtf, use as output path, else construct filename as before
if (grepl("\\.gtf$", args[3])) {
out_path <- args[3]
} else {
fpath <- args[3]
fname <- paste0(
tail(strsplit(args[1], '/')[[1]], 1),
'_', Sys.Date(),
'_constExonsAndIntrons_',
args[2], '.gtf'
)
out_path <- file.path(fpath, fname)
}

export(
object = out[ , -2], # omit gene name
con = file.path(args[3], fname)
con = out_path
)

w <- warnings()
Expand Down
243 changes: 243 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
# # CRIES runner
# ## Counting Reads for Intronic and Exonic Segments

# Requirements
# micromamba env create -n cries r-base=4.2.0 r-data.table r-magrittr bioconda::bioconductor-rtracklayer bioconda::hisat2
# pip install RSeQC
# micromamba install -c bioconda bedops gffread htseq
# conda activate cries

## param.s
input_path=None #';'.join([f'/path/to/{k}.fastq.gz' for k in [id1,id2]]) # fastqs
output_path=None #'path/to.json'

ensembl_release=112
method_count='featureCounts'
temp_dir_path='~/scratch'
cpus=10
kws={}

## for CLI
import argparse
# Create the parser
parser = argparse.ArgumentParser(description="A script with --pms parameter")

# Add --pms argument
parser.add_argument('--input-path', type=str, required=True, help="in path")
parser.add_argument('--output-path', type=str, required=True, help="out path")
parser.add_argument('--kws', type=str, required=False, default=kws, help="kws")

# Parse the arguments
args = parser.parse_args()

input_path=args.input_path
output_path=args.output_path

## packages
import os
from pathlib import Path
from glob import glob

## derived paths
fastq_path=input_path
del input_path

temp_dir_path=Path(temp_dir_path).expanduser().as_posix()
output_path=Path(output_path).expanduser().as_posix()

output_dir_path=Path(output_path).with_suffix('').as_posix()
Path(output_dir_path).mkdir(parents=True,exist_ok=True)

annots_gtf_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{ensembl_release}.gtf"
fa_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

annots_bed_path=f"{temp_dir_path}/ftp.ensembl.org/pub/release-{ensembl_release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.{ensembl_release}_for_gtf2bed.bed"

annots_flt_path=f"{Path(annots_gtf_path).with_suffix('')}_constExonsAndIntrons_ensembl.gtf"
hisat2_index_path=f"{Path(fa_path).with_suffix('')}_hisat2_index"

if not Path(annots_flt_path).exists():
# 2. Download GTF annotation file if not present
if not Path(annots_gtf_path).exists():
print("Downloading annots_gtf_path...")
os.system(f"wget -nc -x {annots_gtf_path.replace(temp_dir_path,'https:/')}.gz -P {temp_dir_path};[ ! -f {annots_gtf_path} ] && gunzip {annots_gtf_path}.gz")
assert Path(annots_gtf_path).exists(), annots_gtf_path

# 4. Run the R script
com=f"Rscript generate_const_intronsExons_gtf.R {annots_gtf_path} ensembl {annots_flt_path}"
print(com)
os.system(com)
assert Path(annots_flt_path).exists(), annots_flt_path

if not Path(annots_bed_path).exists():
## gtf2bed compatible gtf
com=f"gffread {annots_gtf_path} -T -o {Path(annots_bed_path).with_suffix('.gtf')}"
print(com)
os.system(com)
com=f"gtf2bed < {Path(annots_bed_path).with_suffix('.gtf')} > {annots_bed_path}"
print(com)
os.system(com)
assert Path(annots_bed_path).exists(), annots_bed_path

if len(glob(f"{hisat2_index_path}*.ht2"))==0:
if not Path(fa_path).exists():
com=f"wget -nc -x {fa_path.replace(temp_dir_path,'https:/')}.gz -P {temp_dir_path};[ ! -f {fa_path} ] && gunzip {fa_path}.gz"
print(com)
os.system(com)
assert Path(fa_path).exists(), fa_path

com=f"hisat2-build -p {cpus} {fa_path} {hisat2_index_path}"
print(com)
os.system(com)
print(glob(f"{hisat2_index_path}*.ht2"))
assert len(glob(f"{hisat2_index_path}*.ht2"))>0, f"{hisat2_index_path}*.ht2"
# ## Step 2. Mapping reads
# We use HISAT2 for mapping RNA-seq reads, keeping only alignments with MAPQ score ≥30.

# filters out unmapped reads, secondary alignments, failed QC reads, and duplicates.
paired=';' in fastq_path
if paired:
fastq_r1_path,fastq_r2_path=fastq_path.split(';')
del fastq_path
com_arg=f" -1 {fastq_r1_path} -2 {fastq_r2_path} "
else:
com_arg=f"-U {fastq_path}"

# bam_path=Path(output_path).with_suffix('.sorted.flt.bam').as_posix()
bam_path=f"{output_dir_path}/possorted.flt.bam"

if not Path(bam_path).exists():
Path(f"{temp_dir_path}/tmp").mkdir(parents=True,exist_ok=True)

com=f"hisat2 -t -p {cpus} -x {hisat2_index_path} {com_arg} --summary-file {bam_path}.log | samtools sort -T {temp_dir_path}/tmp - | samtools view -b -F 1548 -q 30 -o {bam_path} -"
print(com)
os.system(com)
assert Path(bam_path).exists(), bam_path

# # ## Step 3. Counting reads that map to intronic or exonic segments of each gene

## inferring the strandedness
# The htseq-count parameter `<strand>` depends on the strandedness of the library preparation kit. For Illumina TruSeq RNA Library Prep Kit, the correct parameter is often `reverse`. If not sure, try all the three different options `no`, `yes` and `reverse`, and decide accordingly.

import re
import sys

def get_htseq_strand_argument(
infer_output_text: str,
stranded_threshold: float = 0.75
) -> str:
"""
Parses the text output of RSeQC's infer_experiment.py to determine the
correct strand argument for htseq-count.

Args:
infer_output_text: The full string output from infer_experiment.py.
stranded_threshold: The fraction required to consider the library stranded.
Defaults to 0.75.

Returns:
A string, either 'reverse', 'yes', or 'no'.
"""
if not '\n' in infer_output_text:
# path
infer_output_text=open(infer_output_text,'r').read()

forward_fraction = 0.0
reverse_fraction = 0.0

# Pattern to find the fraction for stranded-forward reads ("1++,1--,2+-,2-+")
forward_match = re.search(r"explained by \"1\+\+,1--,2\+-,2-\+\": ([\d\.]+)", infer_output_text)
if forward_match:
forward_fraction = float(forward_match.group(1))

# Pattern to find the fraction for stranded-reverse reads ("1+-,1-+,2++,2--")
reverse_match = re.search(r"explained by \"1\+-,1-\+,2\+\+,2--\": ([\d\.]+)", infer_output_text)
if reverse_match:
reverse_fraction = float(reverse_match.group(1))

# Handle cases where the output format might be unexpected
if not forward_match and not reverse_match:
# print("Warning: Could not parse RSeQC output. Defaulting to 'no'.", file=sys.stderr)
return 'no'

print(f"for:{forward_fraction},rev:{reverse_fraction}")
# Determine strandedness based on the fractions
if reverse_fraction > stranded_threshold:
return 'reverse'
elif forward_fraction > stranded_threshold:
return 'yes'
else:
return 'no'

# # Example 1: Stranded Reverse library
# output_reverse = """
# This is PairEnd Data
# Fraction of reads failed to determine: 0.0050
# Fraction of reads explained by "1++,1--,2+-,2-+": 0.0080
# Fraction of reads explained by "1+-,1-+,2++,2--": 0.9870
# """

# # Example 2: Stranded Forward library
# output_forward = """
# This is PairEnd Data
# Fraction of reads failed to determine: 0.0045
# Fraction of reads explained by "1++,1--,2+-,2-+": 0.9915
# Fraction of reads explained by "1+-,1-+,2++,2--": 0.0040
# """

# # Example 3: Unstranded library
# output_unstranded = """
# This is PairEnd Data
# Fraction of reads failed to determine: 0.0100
# Fraction of reads explained by "1++,1--,2+-,2-+": 0.4900
# Fraction of reads explained by "1+-,1-+,2++,2--": 0.5000
# """

# # Get the argument for each example
# strand_arg_1 = get_htseq_strand_argument(output_reverse)
# strand_arg_2 = get_htseq_strand_argument(output_forward)
# strand_arg_3 = get_htseq_strand_argument(output_unstranded)

strand_path=Path(bam_path).with_suffix('.strandedness').as_posix()
if not Path(strand_path).exists():
com=f"infer_experiment.py -r {annots_bed_path} -i {bam_path} > {strand_path}"
print(com)
os.system(com)
strand=get_htseq_strand_argument(strand_path)
print(strand)

# We use HTSeq-count for counting reads. For counting exonic reads, we run the HTSeq-count using the "intersection-strict" mode, to ensure that the reads that are counted entirely fit within the mature mRNA sequence, and do not overlap alternatively spliced exons. For intronic reads, we run the HTSeq-count using the "union" mode, since any read that even partially overlaps an intronic region likely originates from pre-mature RNA.

for feat_type in ['exon','intron']:
outp=Path(bam_path).with_suffix(f'.{feat_type}_counts.txt')

if not Path(outp).exists():

if method_count=='htseq':
com=f"htseq-count --type {feat_type} -m {'intersection-strict' if feat_type=='exon' else 'union'} -f bam -r pos --stranded {strand} {bam_path} {annots_flt_path} > {outp}"
# note: for PE data, divide the counts in the output file by 2.
else:
strand_value={'no':0, 'yes':1, 'reverse':2}[strand]
com=f"featureCounts {'-p' if paired else ''} -t {feat_type} {'--fracOverlap 1.0' if feat_type=='exon' else ''} -g gene_id -s {strand_value} -a {annots_flt_path} -o {outp} -T {cpus} {bam_path} "
print(com)
os.system(com)
assert Path(outp).exists(), outp

# ## Step 4. Normalization

# We generally use our package [REMBRANDTS](https://github.com/csglab/REMBRANDTS) for read count normalization, gene filtering, and removing bias in order to obtain stability measurements. REMBRANDTS internally uses DESeq, and performs linear regression of Δexon–Δintron vs. Δintron to obtain residuals that correspond to unbiased estimates of stability.

# In case you do not want to use REMBRANDTS, you can use variance-stabilized transformation of DESeq or DESeq2 for read count normalization (for each set of intronic and exonic reads separately), and then take Δexon–Δintron between any two samples as the measure of differential stability. However, note that this measure can be biased, overestimating the stability of transcriptionally down-regulated genes and underestimating the stability of transcriptionally up-regulated genes, as discussed in this paper: [Alkallas et al., Nat Commun, 8:909](https://www.nature.com/articles/s41467-017-00867-z).

## Outputs
### paths of the outputs
import json
with open(output_path, 'w') as file:
# indent=4 makes the output readable, similar to YAML
json.dump(
dict(
bam_path=bam_path,
),
file,
indent=4
)