Skip to content

aryeelab/methyl-microc

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

15 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Methyl-MicroC Analysis Environment

This repository contains the methyl-microc pipeline using nf-core/methylseq.

Prerequisites

Before setting up this environment, ensure you have:

  • Conda/Miniconda available in your shell environment
  • Java 17 or later (required for Nextflow)

Quick Setup

Step 1: Create Conda Environment

conda create -n methyl-microc -c conda-forge -c bioconda \
  python=3.10.19 nextflow=25.10.2 nf-core=3.5.1 -y

Step 2: Activate Environment and Download Pipeline

conda activate methyl-microc

# Download nf-core/methylseq pipeline
nf-core pipelines download methylseq --revision 4.1.0

Step 3: Optional: Install tools for manual steps

# This is separate from the per-process conda envs created by Nextflow when you run with `-profile conda`.

conda install -c conda-forge -c bioconda \
  bwa=0.7.19 \
  bwameth=0.2.9 \
  samtools=1.23 \
  bedtools=2.31.1 \
  picard=3.4.0 \
  qualimap=2.3 \
  methyldackel=0.6.1 \
  hisat2=2.2.1 \
  minimap2=2.30 \
  trim-galore=0.6.10 \
  fastqc=0.12.1 \
  cutadapt=5.1 \
  pigz=2.8 \
  pbzip2=1.1.13 \
  multiqc=1.30 \
  pairtools=1.1.3 \
  pairix=0.3.9 \
  pysam=0.23.0 \
  -y

# Python dependency for bigWig conversion scripts
python -m pip install pyBigWig==0.3.25

Step 4: Reference Genome Setup

Quick test with chromosome 22 only:

# For faster testing, download just chromosome 22 (~1.5MB)
mkdir -p references/chr22
cd references/chr22
curl -L -o chr22.fa.gz "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr22.fa.gz"
gunzip chr22.fa.gz
cd ../..

Full GRCh38 Reference Genome:

# Create reference directory
mkdir -p references/GRCh38
cd references/GRCh38

# Download GRCh38 reference genome (245MB compressed, ~800MB uncompressed)
curl -L -C - -o GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz \
  'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz'

# Extract the reference genome
gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
cd ../..

Usage

Running the Pipeline

The pipeline is designed to be run using the provided script:

# Activate environment
conda activate methyl-microc

# Run the methylseq pipeline on a small test sample
time ./run_methylseq.sh \
    -profile conda \
    --input tests/samplesheet.csv \
    --outdir results \
    --fasta $PWD/references/chr22/chr22.fa \
    --aligner bwameth

# Run the methylseq pipeline on Arsh's HCT116 samples
time ./run_methylseq.sh \
    -profile conda \
    --input data/20250612_hct116/samplesheet_20250612_hct116.csv \
    --outdir results/20250612_hct116 \
    --fasta $PWD/references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
    --aligner bwameth    


Note that there are several steps below that are not (yet) part of the pipeline and will need to be added. For now they are done manually.

# [ADD TO PIPELINE]
# Convert methylation track bedgraphs to bigwig 
samtools faidx references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

cat results/20250612_hct116/methyldackel/HCT116_Meth_MicroC.markdup.sorted_CpG.bedGraph | grep -v KI | grep -v GL | grep -v random > tmp.bedGraph
time python bin/bedgraph_to_bigwig.py tmp.bedGraph results/20250612_hct116/methyldackel/HCT116_Meth_MicroC.markdup.sorted_CpG.bw references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai

Parse pairs from the bam. (i.e. infer fragments and ligation junctions from the reads)

# [ADD TO PIPELINE]
conda create -n pairtools -c conda-forge -c bioconda \
  python=3.10.19 \
  pairtools=1.1.3 \
  pysam=0.23.0 \
  -y
# NOTE: On Mac OS (arm64) `pairtools parse --add-columns ... seq` crashes
# with a bus error for some newer pysam builds. A known-good pin is `pysam=0.23.0`.

# Test sample
CHROM_SIZES="references/chr22/chr22.fa.fai"
BAM="results/bwameth/deduplicated/test_sample.markdup.sorted.bam"
PAIRS="results/pairs/test_sample.pairs.gz"
STATS="results/pairs/test_sample.stats.txt" 
mkdir -p results/pairs

# HCT116
CHROM_SIZES="references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.fai"
BAM="results/20250612_hct116/bwameth/deduplicated/HCT116_Meth_MicroC.markdup.sorted.bam"
PAIRS="results/20250612_hct116/pairs/HCT116_Meth_MicroC.pairs.gz"
STATS="results/20250612_hct116/pairs/HCT116_Meth_MicroC.stats.txt"
mkdir -p results/20250612_hct116/pairs

conda activate pairtools
time pairtools parse --min-mapq 30 --walks-policy 5unique \
        --max-inter-align-gap 30 --drop-sam --add-columns pos5,pos3,cigar,seq \
        --nproc-in 8 --nproc-out 8 --chroms-path ${CHROM_SIZES} \
        $BAM | \
        pairtools sort --nproc 4  | \
        pairtools dedup -o $PAIRS --output-stats $STATS

Annotate pairs with methylation information

# Append per-fragment methylation strings (meth1/meth2) using the reference FASTA
# (requires the FASTA to be indexed: samtools faidx reference.fa)
# NOTE: the annotator requires the `cigar{1,2}` and `seq{1,2}` columns added by pairtools parse.
#
# Test sample
# time python bin/annotate_pairs_methylation.py \
#     --fasta references/chr22/chr22.fa \
#     --input  $PAIRS \
#     --output results/pairs/test_sample.meth.pairs.gz


# Test sample
METH_PAIRS="results/pairs/test_sample.meth.pairs.gz"
REF="references/chr22/chr22.fa"

# HCT116
METH_PAIRS="results/20250612_hct116/pairs/HCT116_Meth_MicroC.meth.pairs.gz"
REF="references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"

time python bin/annotate_pairs_methylation.py \
    --fasta "${REF}" \
    --input  $PAIRS \
    --output "${METH_PAIRS}"


# Validate methylation annotation on the first pair
#
# This sanity check demonstrates that meth1/meth2 are correctly generated.
# It:
#   1) extracts the first (non-header) pair from results/pairs/test_sample.meth.pairs.gz
#   2) fetches the reference sequence spanning pos5..pos3 (inclusive) from the FASTA
#   3) checks that len(meth) == abs(pos3-pos5)+1 and that CpG sites in the reference align
#      to CpG calls in the methylation string.
#
# Methylation encoding (per base of the fragment span):
#   '1' = methylated CpG
#   '0' = unmethylated CpG
#   '-' = not a CpG position in the reference
#   '.' = unclear / no-call
#
# Coordinate convention:
#   pairtools reports pos5 and pos3 for each side. The analyzed fragment span is the inclusive
#   region between these endpoints; its length is abs(pos3-pos5)+1.

PAIRS_METH="results/pairs/test_sample.meth.pairs.gz"
FASTA="references/chr22/chr22.fa"
python bin/validate_pairs_methylation.py --pairs "${PAIRS_METH}" --fasta "${FASTA}" --record 3

# Or validate a specific record by readID:
READ_ID="LH00547:129:233G5FLT3:8:1101:31194:1673"
python bin/validate_pairs_methylation.py --pairs "${PAIRS_METH}" --fasta "${FASTA}" --readID "${READ_ID}"

gunzip -c results/pairs/test_sample.meth.pairs.gz | head -n 10

# HCT116
#python bin/annotate_pairs_methylation.py \
#   --fasta references/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
#   --input  $pairs \
#   --output results/20250612_hct116/pairs/HCT116_Meth_MicroC.meth.pairs.gz

QC

# [ADD TO PIPELINE]
conda activate methyl-microc

# (Already included in Quick Setup Step 3, but repeated here for convenience)
conda install -c conda-forge -c bioconda multiqc=1.30 -y

# If you specifically need the open2c fork of MultiQC (e.g. for extra pairtools-related modules),
# install it via pip instead and skip the conda MultiQC install above:
# python -m pip install git+https://github.com/open2c/MultiQC.git

multiqc -f -o results/20250612_hct116/multiqc results/20250612_hct116/pairs 


cd stats

Create Cooler and Juicebox hic

# Create conda env for hictk
conda create -n hictk -c conda-forge -c bioconda python=3.10.19 hictk=2.2.0 -y

# [ADD TO PIPELINE]
#PAIRS="results/20250612_hct116/pairs/HCT116_Meth_MicroC_red_klnw.pairs.gz"
#HIC="results/20250612_hct116/hic/HCT116_Meth_MicroC_red_klnw.hic"

PAIRS="results/20250612_hct116/pairs/HCT116_Meth_MicroC.pairs.gz"
HIC="results/20250612_hct116/hic/HCT116_Meth_MicroC.hic"
COOLER="results/20250612_hct116/hic/HCT116_Meth_MicroC.cool"

conda activate hictk

time hictk load --format 4dn --bin-size 100kbp $PAIRS $HIC
time hictk load --format 4dn --bin-size 100kbp $PAIRS $COOLER

Input Format

Create a samplesheet.csv file with your samples:

sample,fastq_1,fastq_2
HCT116_Meth_MicroC,data/20250612_250528_HCT116_Meth_MicroC_AK13242_S27_L008_R1_001.fastq.gz,data/20250612_250528_HCT116_Meth_MicroC_AK13242_S27_L008_R2_001.fastq.gz
HCT116_Meth_MicroC_red_klnw,data/20250612_250528_HCT116_Meth_MicroC_red_klnw_AK13242_S28_L008_R1_001.fastq.gz,data/20250612_250528_HCT116_Meth_MicroC_red_klnw_AK13242_S28_L008_R2_001.fastq.gz

About

No description, website, or topics provided.

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published