Add Warnings for low coverage and unfixed suboptimal alleles #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

SergioBuenestadoSerrano wants to merge 4 commits into phglab:master from MG-IiSGM:master

LICENSE

100644 → 100755

Empty file.

MIRUReader.py

100644 → 100755

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -12,7 +12,28 @@
  
    from collections import Counter

    #function to determine repeat number based on total number of mismatches in primer sequence

    # Function that corrects the mode() function where it does not always return statistical error

    def custom_mode(List):

        counts = Counter(List)

        max_count = max(counts.values())

        modes = [key for key, value in counts.items() if value == max_count]

        if len(modes) == 1:

            return modes[0]

        else:

            raise statistics.StatisticsError

    # Function to extract multiples modes

    def modes(List):

        counts = Counter(List)

        max_count = max(counts.values())

        modes = [key for key, value in counts.items() if value == max_count]

        return modes

    # Function to determine repeat number based on total number of mismatches in primer sequence

    def chooseMode(name, table, CounterList):

        maxcount = max(CounterList.values())

        repeatToCheck = []

    @@ -54,24 +75,33 @@ def chooseMode(name, table, CounterList):
  
    MIRU_primers = script_dir + "/MIRU_primers"

    parser = argparse.ArgumentParser()

    main_group = parser.add_argument_group('Main options')

    main_group.add_argument('-r', '--reads', required=True, help='input reads file in fastq/fasta format (required)')

    main_group.add_argument('-p', '--prefix', required=True, help='sample ID (required)')

    main_group.add_argument('--table', type=str, default=MIRU_table, help='allele calling table')

    main_group.add_argument('--primers', type=str, default=MIRU_primers, help='primers sequences')

    optional_group = parser.add_argument_group('Optional options')

    optional_group.add_argument('--amplicons', help='provide output from primersearch and summarize MIRU profile directly', action='store_true')

    optional_group.add_argument('--details', help='for inspection', action='store_true')

    optional_group.add_argument('--mismatch', type=int, dest="mismatch", required=False, default=18, help="Allowed percent mismatch. Default: 18")

    optional_group.add_argument('--nofasta', help='delete the fasta reads file generated if your reads are in fastq format', action='store_true')

    optional_group.add_argument('--min_amplicons', type=int, dest='min_amplicons', required=False, default=3, help='Minimum number of amplicons required for a reliable result. Below this threshold, the program returns "Warning 1" for low coverage. Default: 3')

    optional_group.add_argument('--freq', type=float, dest='freq', required=False, default=0.6, help='Minimum frequency required for a reliable result. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 0.6')

    # optional_group.add_argument('--amplicon_freq', type=int, dest='amplicon_freq', required=False, default=20, help='Number of amplicons required for reliable results with mixed alleles. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 20')

    optional_group.add_argument('--amplicon_mode', type=int, dest='amplicon_mode', required=False, default=10, help='Number of amplicons required for reliable results. Below this threshold, the program returns "Warning 3" for a possible polyclonal allele with low coverage. Default: 10')

    args = parser.parse_args()

    if not os.path.exists(args.reads):

        sys.exit('Error: ' + args.reads + ' is not found!')

    sample_prefix = args.prefix

    sample_dir = os.path.dirname(os.path.abspath(args.reads))

    mismatch_allowed = 18

    psearchOut = sample_dir + '/' + sample_prefix + '.' + str(mismatch_allowed) + '.primersearch.out'

    mismatch_allowed = args.mismatch

    psearchOut = sample_dir + '/' + sample_prefix + '.' + str(args.mismatch) + '.primersearch.out'

    df = pd.read_csv(MIRU_table, sep='\t')

    df_0580 = pd.read_csv(MIRU_table_0580, sep='\t')

    @@ -107,7 +137,7 @@ def chooseMode(name, table, CounterList):
  
    if not args.amplicons:

        try:

            subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(mismatch_allowed), '-outfile', psearchOut]

            subprocess_args = ['primersearch', '-seqall', fastaReads, '-infile', args.primers, '-mismatchpercent', str(args.mismatch), '-outfile', psearchOut]

            subprocess.call(subprocess_args)

        except OSError:

            print('OSError: primersearch command is not found.')

    @@ -179,29 +209,27 @@ def chooseMode(name, table, CounterList):
  
                                    repeats.setdefault(loci).append(0)

                                    lookup.setdefault(primerID).append(0)

    if args.details:

        myLookUp = pd.DataFrame(columns=["loci", "hit_index", "repeat_no", "error_no"])

        for key, value in lookup.items():

            #example: lookup = {'0154_1':[2,4]} total no. of mismatches, repeat number

            myLookUp = myLookUp.append({"loci":key.split("_")[0], "hit_index":int(key.split("_")[1]), "repeat_no":lookup[key][1], "error_no":lookup[key][0]}, ignore_index=True)

        sortedLookUp = myLookUp.sort_values(by=["loci", "hit_index"])

        print(sortedLookUp.to_csv(sep='\t', index=False))

        for item in miru:

            #array that used to determine repeat number

            print(Counter(repeats[item]))

    miru_repeats = pd.DataFrame(columns = ['sample_prefix'] + miru, index = range(1))

    miru_repeats['sample_prefix'] = sample_prefix

    for item in miru:

        if repeats[item] != []:

            try:

                repeat = mode(repeats[item])

                miru_repeats[item][0] = repeat

                if len(repeats[item]) < args.min_amplicons:

                    repeat = f"{custom_mode(repeats[item])} (Warning 1: Low Coverage)"

                # elif repeats[item].count(mode(repeats[item])) / len(repeats[item]) <= args.freq and len(repeats[item]) <= args.amplicon_freq: ## If you need to put some minimum number of amplicon for those unfixed alleles, uncomment this line and the flag corresponded.

                elif repeats[item].count(mode(repeats[item])) / len(repeats[item]) <= args.freq:

                    repeat = f"{custom_mode(repeats[item])} (Warning 2: Unfixed allele)"

                else:

                    repeat = custom_mode(repeats[item])

            except statistics.StatisticsError:

                repeat = chooseMode(item, lookup, Counter(repeats[item]))

                miru_repeats[item][0] = repeat

                if len(repeats[item]) < args.amplicon_mode:

                    repeat = f"{chooseMode(item, lookup, Counter(repeats[item]))} (Warning 3: Possible polyclonal {modes(repeats[item])}, Low Coverage)"

                else:

                    repeat = f"{chooseMode(item, lookup, Counter(repeats[item]))} (Warning 4: Possible polyclonal {modes(repeats[item])})"

        else:

            miru_repeats[item][0] = "ND"

            repeat = "ND"

        miru_repeats[item][0] = repeat

    if args.nofasta:

        if ('.fastq' in args.reads) or ('.gz' in args.reads):

MIRU_primers

100644 → 100755

Empty file.

MIRU_table

100644 → 100755

Empty file.

MIRU_table_0580

100644 → 100755

Empty file.

README.md

100644 → 100755

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -2,50 +2,62 @@
  
    ## Description

    Identify 24-locus MIRU-VNTR for *Mycobacterium tuberculosis* complex (MTBC) directly from long reads generated by Oxford Nanopore Technologies (ONT) and Pacific Biosciences (PacBio). Also work on assembled genome.

    Identify 24-locus MIRU-VNTR for _Mycobacterium tuberculosis_ complex (MTBC) directly from long reads generated by Oxford Nanopore Technologies (ONT) and Pacific Biosciences (PacBio). Also work on assembled genome.

    ## Requirements

    * Linux

    * primersearch from [EMBOSS](http://emboss.sourceforge.net/download/)

       * install from the official website or

       * install via conda `conda install -c bioconda emboss`

       * Ensure the primersearch command is in your device's environment path, where primersearch program can be executed directly by typing `primersearch` on the commandline

    * [*pandas*](https://pandas.pydata.org/) 

       * can be installed via conda `conda install pandas` or via PyPI `pip install pandas`

    * [*statistics*](https://pypi.org/project/statistics/)

       * can be installed via PyPI `pip install statistics`

    - Linux

    - primersearch from [EMBOSS](http://emboss.sourceforge.net/download/)

      - install from the official website or

      - install via conda `conda install -c bioconda emboss`

      - Ensure the primersearch command is in your device's environment path, where primersearch program can be executed directly by typing `primersearch` on the commandline

    - [_pandas_](https://pandas.pydata.org/)

      - can be installed via conda `conda install pandas` or via PyPI `pip install pandas`

    - [_statistics_](https://pypi.org/project/statistics/)

      - can be installed via PyPI `pip install statistics`

    ## Installation

    `git clone https://github.com/phglab/MIRUReader.git`

    `git clone https://github.com/phglab/MIRUReader.git` or `git clone https://github.com/MG-IiSGM/MIRUReader` (for this modified version)

    ## Change log

    #### 03/07/2024

    - Added different depth and frequency parameters to flag potential suboptimal alleles.

    - Updated interpretation documentation to the README

    #### 13/09/2019

    - Added a check to ensure primersearch is executable prior to MIRUReader program execution

    - Updated documentation to the README

    #### 04/07/2019

    - Update output format for option '--details'.

    #### 14/06/2019

    - Auto convert fastq to fasta.

    ## Usage example

    For one sample analysis:

    ```

    python /your/path/to/MIRUReader.py -r sample.fasta -p sampleID > miru.txt

    ```

    For multiple samples analysis:

    1. Create a mapping file (mappingFile.txt) that looks like:

        sample_001.fasta sample_001 \

        sample_002.fasta sample_002 \

        ...

       sample_001.fasta sample_001 \

       sample_002.fasta sample_002 \

       ...

    2. Then run the program:

    ```

    cat mappingFile.txt | while read -a line; do python /your/path/to/MIRUReader.py -r ${line[0]} -p ${line[1]}; done > miru.multiple.txt

    ```

    @@ -58,31 +70,47 @@ sample_001      2       4       4       2       3       3       3       2
  
    ```

    Notes:

    * The program is compatible to Python 2 and Python 3.

    * Accepted reads file format includes '.fastq', '.fastq.gz', '.fasta', and '.fasta.gz'.

    * The program output is a tab-delimited plain text which can be copied to or opened in Excel spreadsheet.

    ## Full usage

    | Main options | Description |

    | ------------ | ----------- |

    | -r READS | Input reads file in fastq/fasta format, can be gzipped or not gzipped |

    | -p PREFIX | Sample ID required for naming output file. |

    | --table TABLE | Allele calling table, default is MIRU_table. Can be user-defined in fixed format. However, providing custom allele calling table for other VNTR is not tested. |

    | --primers PRIMERS | Primers sequences, default is MIRU_primers. Can be user-defined in fixed format. |

    - The program is compatible to Python 2 and Python 3.

    - Accepted reads file format includes '.fastq', '.fastq.gz', '.fasta', and '.fasta.gz'.

    - The program output is a tab-delimited plain text which can be copied to or opened in Excel spreadsheet.

    ## Full usage

    | Optional options | Description |

    | ---------------- | ----------- |

    | --amplicons | Use output from primersearch ("prefix.18.primersearch.out") and summarize MIRU profile directly. |

    | --details | This option is for further inspection. It displays details of repeat count for each loci with total mismatch error in the primer sequences alignment. |

    | --nofasta | Delete fasta file generated if your input read is in fastq format. |

    | Main options      | Description                                                                                                                                                    |

    | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |

    | -r READS          | Input reads file in fastq/fasta format, can be gzipped or not gzipped                                                                                          |

    | -p PREFIX         | Sample ID required for naming output file.                                                                                                                     |

    | --table TABLE     | Allele calling table, default is MIRU_table. Can be user-defined in fixed format. However, providing custom allele calling table for other VNTR is not tested. |

    | --primers PRIMERS | Primers sequences, default is MIRU_primers. Can be user-defined in fixed format.                                                                               |

    | Optional options | Description                                                                                                                                                                     |

    | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |

    | --amplicons      | Use output from primersearch ("prefix.18.primersearch.out") and summarize MIRU profile directly.                                                                                |

    | --nofasta        | Delete fasta file generated if your input read is in fastq format.                                                                                                              |

    | --mismatch       | Allowed percent mismatch. Default: 18                                                                                                                                           |

    | --min_amplicons  | Minimum number of amplicons required for a reliable result. Below this threshold, the program returns "Warning 1" for low coverage. Default: 3                                  |

    | --freq           | Minimum frequency required for a reliable result. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 0.6                                     |

    | --amplicon_freq  | Number of amplicons required for reliable results with mixed alleles. Below this threshold, the program returns "Warning 2" for an unfixed allele. Default: 20 [Flag commented] |

    | --amplicon_mode  | Number of amplicons required for reliable results. Below this threshold, the program returns "Warning 3" for a possible polyclonal allele with low coverage. Default: 10        |

    | Interpretation | Description                                                                                                                                             |

    | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |

    | Warning 1      | Low coverage. Fewer than 3 amplicons (default) as indicated by the `--min_amplicons` flag                                                               |

    | Warning 2      | Unfixed allele. Majority value frequency is less than 0.6 (default) at the locus, as indicated by the `--freq` flag                                     |

    | Warning 3      | Possible polyclonal - Low coverage. Two modes with the same frequency and fewer than 10 amplicons (default), as indicated by the `--amplicon_mode` flag |

    | Warning 4      | Possible polyclonal. Two modes and more than 10 amplicons (default) validating the locus, as indicated by the `--amplicon_mode` flag                    |

    All warnings must be taken into account due to low coverage or frequencies, and they should be inspected manually or even repeated.

    If analyzing from a `.fasta assembly`, 'Warning 1' for low coverage will appear, as contigs are used and only a single fragment should support the locus region.

    ## FAQ

    1. **Why are there two MIRU allele calling tables (MIRU_table and MIRU_table_0580)?** 

    MIRU loci 0580 (MIRU_table_0580) consist of a different numbering system for determination of repeat numbers as compared to the other 23 MIRU locus (MIRU_table) for MTBC isolates.  

    1. **Why are there two MIRU allele calling tables (MIRU_table and MIRU_table_0580)?**

    MIRU loci 0580 (MIRU_table_0580) consist of a different numbering system for determination of repeat numbers as compared to the other 23 MIRU locus (MIRU_table) for MTBC isolates.

    ## Troubleshooting

    1. If an error message `OSError: primersearch is not found.` appears, please ensure your `primersearch` executable file is in your environment path (`echo $PATH`) and can be called directly. 

    1. If an error message `OSError: primersearch is not found.` appears, please ensure your `primersearch` executable file is in your environment path (`echo $PATH`) and can be called directly.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add Warnings for low coverage and unfixed suboptimal alleles #2

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Add Warnings for low coverage and unfixed suboptimal alleles #2

Are you sure you want to change the base?

Uh oh!

Add Warnings for low coverage and unfixed suboptimal alleles #2

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing