Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified LICENSE
100644 → 100755
Empty file.
70 changes: 49 additions & 21 deletions MIRUReader.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,28 @@
from collections import Counter


#function to determine repeat number based on total number of mismatches in primer sequence
# Function that corrects the mode() function where it does not always return statistical error
def custom_mode(List):
counts = Counter(List)
max_count = max(counts.values())

modes = [key for key, value in counts.items() if value == max_count]

if len(modes) == 1:
return modes[0]
else:
raise statistics.StatisticsError

# Function to extract multiples modes
def modes(List):
counts = Counter(List)
max_count = max(counts.values())

modes = [key for key, value in counts.items() if value == max_count]
return modes


# Function to determine repeat number based on total number of mismatches in primer sequence
def chooseMode(name, table, CounterList):
maxcount = max(CounterList.values())
repeatToCheck = []
Expand Down Expand Up @@ -54,24 +75,33 @@ def chooseMode(name, table, CounterList):
MIRU_primers = script_dir + "/MIRU_primers"

parser = argparse.ArgumentParser()

main_group = parser.add_argument_group('Main options')

main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)')
main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)')
main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table')
main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences')

optional_group = parser.add_argument_group('Optional options')

optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true')
optional_group.add_argument('--details', help='for inspection', action='store_true')
optional_group.add_argument('--mismatch', type=int, dest="mismatch", required=False, default=18, help="Allowed percent mismatch. Default: 18")
optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true')
optional_group.add_argument('--min_amplicons', type=int, dest='min_amplicons', required=False, default=3, help='Minimum number of amplicons required for a reliable result. Below this threshold, the program returns "Warning 1" for low coverage. Default: 3')
optional_group.add_argument('--freq', type=float, dest='freq', required=False, default=0.6, help='Minimum frequency required for a reliable result. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 0.6')
# optional_group.add_argument('--amplicon_freq', type=int, dest='amplicon_freq', required=False, default=20, help='Number of amplicons required for reliable results with mixed alleles. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 20')
optional_group.add_argument('--amplicon_mode', type=int, dest='amplicon_mode', required=False, default=10, help='Number of amplicons required for reliable results. Below this threshold, the program returns "Warning 3" for a possible polyclonal allele with low coverage. Default: 10')

args = parser.parse_args()

if not os.path.exists(args.reads):
sys.exit('Error: ' + args.reads + ' is not found!')

sample_prefix = args.prefix
sample_dir = os.path.dirname(os.path.abspath(args.reads))
mismatch_allowed = 18
psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out'
mismatch_allowed = args.mismatch
psearchOut = sample_dir + '/' + sample_prefix + '.' + str(args.mismatch) + '.primersearch.out'

df = pd.read_csv(MIRU_table, sep='\t')
df_0580 = pd.read_csv(MIRU_table_0580, sep='\t')
Expand Down Expand Up @@ -107,7 +137,7 @@ def chooseMode(name, table, CounterList):

if not args.amplicons:
try:
subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut]
subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(args.mismatch), '-outfile', psearchOut]
subprocess.call(subprocess_args)
except OSError:
print('OSError: primersearch command is not found.')
Expand Down Expand Up @@ -179,29 +209,27 @@ def chooseMode(name, table, CounterList):
repeats.setdefault(loci).append(0)
lookup.setdefault(primerID).append(0)

if args.details:
myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"])
for key, value in lookup.items():
#example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number
myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True)
sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"])
print(sortedLookUp.to_csv(sep='\t', index=False))
for item in miru:
#array that used to determine repeat number
print(Counter(repeats[item]))

miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1))
miru_repeats['sample_prefix'] = sample_prefix
for item in miru:
if repeats[item] != []:
try:
repeat = mode(repeats[item])
miru_repeats[item][0] = repeat
if len(repeats[item]) < args.min_amplicons:
repeat = f"{custom_mode(repeats[item])} (Warning 1: Low Coverage)"
# elif repeats[item].count(mode(repeats[item])) / len(repeats[item]) <= args.freq and len(repeats[item]) <= args.amplicon_freq: ## If you need to put some minimum number of amplicon for those unfixed alleles, uncomment this line and the flag corresponded.
elif repeats[item].count(mode(repeats[item])) / len(repeats[item]) <= args.freq:
repeat = f"{custom_mode(repeats[item])} (Warning 2: Unfixed allele)"
else:
repeat = custom_mode(repeats[item])
except statistics.StatisticsError:
repeat = chooseMode(item, lookup, Counter(repeats[item]))
miru_repeats[item][0] = repeat
if len(repeats[item]) < args.amplicon_mode:
repeat = f"{chooseMode(item, lookup, Counter(repeats[item]))} (Warning 3: Possible polyclonal {modes(repeats[item])}, Low Coverage)"
else:
repeat = f"{chooseMode(item, lookup, Counter(repeats[item]))} (Warning 4: Possible polyclonal {modes(repeats[item])})"
else:
miru_repeats[item][0] = "ND"
repeat = "ND"

miru_repeats[item][0] = repeat

if args.nofasta:
if ('.fastq' in args.reads) or ('.gz' in args.reads):
Expand Down
Empty file modified MIRU_primers
100644 → 100755
Empty file.
Empty file modified MIRU_table
100644 → 100755
Empty file.
Empty file modified MIRU_table_0580
100644 → 100755
Empty file.
94 changes: 61 additions & 33 deletions README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,62 @@

## Description

Identify 24-locus MIRU-VNTR for *Mycobacterium tuberculosis* complex (MTBC) directly from long reads generated by Oxford Nanopore Technologies (ONT) and Pacific Biosciences (PacBio). Also work on assembled genome.
Identify 24-locus MIRU-VNTR for _Mycobacterium tuberculosis_ complex (MTBC) directly from long reads generated by Oxford Nanopore Technologies (ONT) and Pacific Biosciences (PacBio). Also work on assembled genome.

## Requirements

* Linux
* primersearch from [EMBOSS](http://emboss.sourceforge.net/download/)
* install from the official website or
* install via conda `conda install -c bioconda emboss`
* Ensure the primersearch command is in your device's environment path, where primersearch program can be executed directly by typing `primersearch` on the commandline
* [*pandas*](https://pandas.pydata.org/)
* can be installed via conda `conda install pandas` or via PyPI `pip install pandas`
* [*statistics*](https://pypi.org/project/statistics/)
* can be installed via PyPI `pip install statistics`
- Linux
- primersearch from [EMBOSS](http://emboss.sourceforge.net/download/)
- install from the official website or
- install via conda `conda install -c bioconda emboss`
- Ensure the primersearch command is in your device's environment path, where primersearch program can be executed directly by typing `primersearch` on the commandline
- [_pandas_](https://pandas.pydata.org/)
- can be installed via conda `conda install pandas` or via PyPI `pip install pandas`
- [_statistics_](https://pypi.org/project/statistics/)
- can be installed via PyPI `pip install statistics`

## Installation

`git clone https://github.com/phglab/MIRUReader.git`
`git clone https://github.com/phglab/MIRUReader.git` or `git clone https://github.com/MG-IiSGM/MIRUReader` (for this modified version)

## Change log

#### 03/07/2024

- Added different depth and frequency parameters to flag potential suboptimal alleles.
- Updated interpretation documentation to the README

#### 13/09/2019

- Added a check to ensure primersearch is executable prior to MIRUReader program execution
- Updated documentation to the README

#### 04/07/2019

- Update output format for option '--details'.

#### 14/06/2019

- Auto convert fastq to fasta.

## Usage example

For one sample analysis:

```
python /your/path/to/MIRUReader.py -r sample.fasta -p sampleID > miru.txt
```

For multiple samples analysis:

1. Create a mapping file (mappingFile.txt) that looks like:

sample_001.fasta sample_001 \
sample_002.fasta sample_002 \
...
sample_001.fasta sample_001 \
sample_002.fasta sample_002 \
...

2. Then run the program:

```
cat mappingFile.txt | while read -a line; do python /your/path/to/MIRUReader.py -r ${line[0]} -p ${line[1]}; done > miru.multiple.txt
```
Expand All @@ -58,31 +70,47 @@ sample_001 2 4 4 2 3 3 3 2
```

Notes:
* The program is compatible to Python 2 and Python 3.
* Accepted reads file format includes '.fastq', '.fastq.gz', '.fasta', and '.fasta.gz'.
* The program output is a tab-delimited plain text which can be copied to or opened in Excel spreadsheet.

## Full usage

| Main options | Description |
| ------------ | ----------- |
| -r READS | Input reads file in fastq/fasta format, can be gzipped or not gzipped |
| -p PREFIX | Sample ID required for naming output file. |
| --table TABLE | Allele calling table, default is MIRU_table. Can be user-defined in fixed format. However, providing custom allele calling table for other VNTR is not tested. |
| --primers PRIMERS | Primers sequences, default is MIRU_primers. Can be user-defined in fixed format. |
- The program is compatible to Python 2 and Python 3.
- Accepted reads file format includes '.fastq', '.fastq.gz', '.fasta', and '.fasta.gz'.
- The program output is a tab-delimited plain text which can be copied to or opened in Excel spreadsheet.

## Full usage

| Optional options | Description |
| ---------------- | ----------- |
| --amplicons | Use output from primersearch ("prefix.18.primersearch.out") and summarize MIRU profile directly. |
| --details | This option is for further inspection. It displays details of repeat count for each loci with total mismatch error in the primer sequences alignment. |
| --nofasta | Delete fasta file generated if your input read is in fastq format. |
| Main options | Description |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| -r READS | Input reads file in fastq/fasta format, can be gzipped or not gzipped |
| -p PREFIX | Sample ID required for naming output file. |
| --table TABLE | Allele calling table, default is MIRU_table. Can be user-defined in fixed format. However, providing custom allele calling table for other VNTR is not tested. |
| --primers PRIMERS | Primers sequences, default is MIRU_primers. Can be user-defined in fixed format. |

| Optional options | Description |
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| --amplicons | Use output from primersearch ("prefix.18.primersearch.out") and summarize MIRU profile directly. |
| --nofasta | Delete fasta file generated if your input read is in fastq format. |
| --mismatch | Allowed percent mismatch. Default: 18 |
| --min_amplicons | Minimum number of amplicons required for a reliable result. Below this threshold, the program returns "Warning 1" for low coverage. Default: 3 |
| --freq | Minimum frequency required for a reliable result. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 0.6 |
| --amplicon_freq | Number of amplicons required for reliable results with mixed alleles. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 20 [Flag commented] |
| --amplicon_mode | Number of amplicons required for reliable results. Below this threshold, the program returns "Warning 3" for a possible polyclonal allele with low coverage. Default: 10 |

| Interpretation | Description |
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Warning 1 | Low coverage. Fewer than 3 amplicons (default) as indicated by the `--min_amplicons` flag |
| Warning 2 | Unfixed allele. Majority value frequency is less than 0.6 (default) at the locus, as indicated by the `--freq` flag |
| Warning 3 | Possible polyclonal - Low coverage. Two modes with the same frequency and fewer than 10 amplicons (default), as indicated by the `--amplicon_mode` flag |
| Warning 4 | Possible polyclonal. Two modes and more than 10 amplicons (default) validating the locus, as indicated by the `--amplicon_mode` flag |

All warnings must be taken into account due to low coverage or frequencies, and they should be inspected manually or even repeated.

If analyzing from a `.fasta assembly`, 'Warning 1' for low coverage will appear, as contigs are used and only a single fragment should support the locus region.

## FAQ
1. **Why are there two MIRU allele calling tables (MIRU_table and MIRU_table_0580)?**

MIRU loci 0580 (MIRU_table_0580) consist of a different numbering system for determination of repeat numbers as compared to the other 23 MIRU locus (MIRU_table) for MTBC isolates.
1. **Why are there two MIRU allele calling tables (MIRU_table and MIRU_table_0580)?**

MIRU loci 0580 (MIRU_table_0580) consist of a different numbering system for determination of repeat numbers as compared to the other 23 MIRU locus (MIRU_table) for MTBC isolates.

## Troubleshooting
1. If an error message `OSError: primersearch is not found.` appears, please ensure your `primersearch` executable file is in your environment path (`echo $PATH`) and can be called directly.

1. If an error message `OSError: primersearch is not found.` appears, please ensure your `primersearch` executable file is in your environment path (`echo $PATH`) and can be called directly.