diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst new file mode 100644 index 00000000..e4ac5902 --- /dev/null +++ b/docs/commands/validate.rst @@ -0,0 +1,76 @@ +.. _commands-validate: + + +validate +======== + +Validate the formatting of a :doc:`.hap file `. Output warnings/errors explaining how the formatting of your ``.hap`` file may be improved. + +If a :ref:`.pvar file ` is provided, the SNPs and TRs present in the ``.hap`` file will be checked for existence in the ``.pvar`` file. + +.. note:: + + This command will not check that your ``.hap`` file is properly sorted. It only checks formatting. + +Usage +~~~~~ +.. code-block:: bash + + haptools validate \ + --sorted \ + --genotypes PATH \ + --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \ + HAPFILE + +Examples +~~~~~~~~ +.. code-block:: bash + + haptools validate tests/data/validate/basic.hap + +Outputs a message specifying the number of errors and warnings. + +.. code-block:: + + [ INFO] Completed .hap file validation with 0 errors and 0 warnings. + +All warnings and errors will be logged if there are any. + +.. code-block:: bash + + haptools validate tests/data/validate/no_version.hap + +.. code-block:: + + [ WARNING] No version declaration found. Assuming to use the latest version. + [ INFO] Completed .hap file validation with 0 errors and 1 warnings. + Error: Found several warnings and / or errors in the .hap file + +All ``.hap`` files must be sorted before they can be validated, so we try our best to sort your ``.hap`` file internally before performing any validation checks. +If your ``.hap`` file is already sorted, you should use the ``--sorted`` parameter. It will speed things up a bit by skipping the sorting step. If your ``.hap`` file is indexed, it will be assumed to be sorted regardless. +
.. 
code-block:: bash + + haptools validate --sorted tests/data/simple.hap + +As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pvar`` file with which to compare the existence of variant IDs. +The following will check if all of the variant IDs in the ``.hap`` file appear in the ``.pvar`` file. + +.. code-block:: bash + + haptools validate --genotypes tests/data/simple.pvar tests/data/simple.hap + +.. note:: + + We accept a PVAR file instead of a VCF in order to avoid reading lots of information + which is not relevant to the validation process. However, any VCF subsetted to just + its first 8 fields is a valid PVAR file. So you can easily create a PVAR file from a + VCF using ``cut -f -8`` or ``plink2 --make-just-pvar``. + +Detailed Usage +~~~~~~~~~~~~~~ + +.. click:: haptools.__main__:main + :prog: haptools + :show-nested: + :commands: validate diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst index 079edd4d..ec4bb970 100644 --- a/docs/formats/genotypes.rst +++ b/docs/formats/genotypes.rst @@ -18,7 +18,7 @@ Genotype files must be specified as VCF or BCF files. They can be bgzip-compress PLINK2 PGEN ~~~~~~~~~~~ -There is also experimental support for `PLINK2 PGEN `_ files in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. +There is also experimental support for `PLINK2 PGEN `_ files (accompanied by PVAR and PSAM files) in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. If you run out of memory when using PGEN files, consider reading/writing variants from the file in chunks via the ``--chunk-size`` parameter. 
diff --git a/docs/index.rst b/docs/index.rst index 0d957c62..7e02fd34 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,8 @@ Commands * :doc:`haptools transform `: Transform a set of genotypes via a list of haplotypes. Create a new VCF containing haplotypes instead of variants. +* :doc:`haptools validate `: Validate the formatting of a haplotype file. + * :doc:`haptools index `: Sort, compress, and index our custom file format for haplotypes. * :doc:`haptools clump `: Convert variants in LD with one another into clumps. @@ -95,6 +97,7 @@ There is an option to *Cite this repository* on the right sidebar of `the reposi commands/simphenotype.rst commands/karyogram.rst commands/transform.rst + commands/validate.rst commands/index.rst commands/clump.rst commands/ld.rst diff --git a/haptools/__main__.py b/haptools/__main__.py index 39cf3322..d67bfe34 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1025,6 +1025,61 @@ def clump( ) +@main.command(short_help="Validate the structure of a .hap file") +@click.argument("filename", type=click.Path(exists=True, path_type=Path)) +@click.option( + "--sorted/--not-sorted", + is_flag=True, + default=False, + show_default=True, + help="Has the file been sorted already?", +) +@click.option( + "--genotypes", + type=click.Path(path_type=Path), + default=None, + show_default="optional .pvar file to compare against", + help=( + "A .pvar file containing variant IDs in order to compare them to the .hap file" + ), +) +@click.option( + "-v", + "--verbosity", + type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]), + default="INFO", + show_default=True, + help="The level of verbosity desired", +) +def validate( + filename: Path, + sorted: bool = False, + genotypes: Path | None = None, + verbosity: str = "INFO", +): + """ + Validate the formatting of a .hap file + + Output warnings/errors explaining how the formatting of your .hap file may + be improved. 
+ """ + from .logging import getLogger + from .validate import is_hapfile_valid + + log = getLogger(name="validate", level=verbosity) + + # if the hap file is compressed and a .tbi index exists for it, assume it is sorted + if filename.suffix == ".gz" and filename.with_suffix(".gz.tbi").exists(): + sorted = True + + is_valid = is_hapfile_valid(filename, sorted=sorted, log=log, pvar=genotypes) + + if not is_valid: + raise click.ClickException( + "Found several warnings and / or errors in the .hap file" + ) + + if __name__ == "__main__": # run the CLI if someone tries 'python -m haptools' on the command line main(prog_name="haptools") diff --git a/haptools/transform.py b/haptools/transform.py index df70f6cb..807dca28 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -175,7 +175,7 @@ class GenotypesAncestry(data.GenotypesVCF): See documentation for :py:attr:`~.Genotypes.log` """ - def __init__(self, fname: Path | str, log: Logger = None): + def __init__(self, fname: Path | str, log: logging.Logger = None): super().__init__(fname, log) self.ancestry = None self.valid_labels = None diff --git a/haptools/validate.py b/haptools/validate.py new file mode 100644 index 00000000..c9baec35 --- /dev/null +++ b/haptools/validate.py @@ -0,0 +1,1074 @@ +from __future__ import annotations + +import os +import logging +from re import search +from pathlib import Path + +from .logging import getLogger +from .data import GenotypesPLINK + + +LOGGER_NAME = "validate" + + +def tmpex(expectation: object, received: object) -> str: + return f"Expected: {expectation}\nReceived: {received}" + + +class Line: + """ + A line in the file + + Attributes + ---------- + columns : list[str] + The line split into separate columns + content : str + The content of the line as a string + number : int + The line number + count : int + The number of columns in this line + """ + def __init__(self, content: str, number: int): + self.content: str = content + self.number: int = number + + 
self.columns: list[str] = content.split() + self.count: int = len(self.columns) + + def __getitem__(self, index: int) -> str: + """ + Index into the columns of the line + + Parameters + ---------- + index : int + The index into the line. Must be less than :py:attr:`~.Line.count` + + Returns + ------- + str + The column at this index + """ + return self.columns[index] + + def __str__(self) -> str: + """ + Retrieve the line as a string + + Returns + ------- + str + The line + """ + return self.content + + +class HapFileIO: + """ + Process lines from a .hap file + + Attributes + ---------- + filename : Path + The path to the file + logger : logging.Logger, optional + A logging instance for recording errors/warnings statements + """ + def __init__(self, filename: Path, logger: logging.Logger = None): + self.filename = filename + self.log = logger or logging.getLogger(self.__class__.__name__) + + def lines(self, sorted: bool = False) -> list[Line]: + """ + Retrieve the lines of the file as Line instances + + Sort the lines if they're unsorted + + Parameters + ---------- + sorted : bool, optional + Whether the file can be assumed to be sorted + + Returns + ------- + list[Line] + The lines of the file + """ + buffer = open(self.filename) + + content = [ + Line(line.strip(), i + 1) + for i, line in enumerate(buffer.readlines()) + if line and (not line.isspace()) + ] + + buffer.close() + + if not sorted: + self.log.debug("Assuming .hap file is unsorted. 
Attempting to sort.") + meta_limit = next( + idx for idx, line in enumerate(content) if not line[0].startswith("#") + ) + content = [ + line + for idx, line in enumerate(content) + if (not line[0].startswith("#")) or idx < meta_limit + ] + content.sort(key=lambda line: ord(line[0][0])) + self.log.debug("Finished sorting .hap file") + + return content + + def validate_existence(self) -> bool: + """ + Check whether the .hap file exists and can be read + + Returns + ------- + bool + True if it exists and False otherwise + """ + if not self.exists(): + self.log.error(f"The file {self.filename} does not exist.") + return False + + is_ok = True + + if not self.is_regular(): + self.log.error(f"Cannot read {self.filename}: Is not a regular file.") + is_ok = False + + if not self.is_readable(): + self.log.error(f"Cannot read {self.filename}: Insufficient permissions.") + is_ok = False + + return is_ok + + def exists(self) -> bool: + """ + Check if the file exists + + Returns + ------- + bool + True if it exists and False otherwise + """ + return self.filename.exists() + + def is_regular(self): + """ + Check if the file can be opened by python + + Symlinks are also allowed + + Returns + ------- + bool + True if it can be opened and False otherwise + """ + return self.filename.is_file() + + def is_readable(self) -> bool: + """ + Check if the file can be read by python + + Returns + ------- + bool + True if it can be read and False otherwise + """ + return os.access(self.filename, os.R_OK) + + +class HapFileValidator: + """ + Validate lines from a .hap file + + Attributes + ---------- + log : logging.Logger, optional + A logging instance for recording errors/warnings statements + vars_ex : dict[int, dict[str, type]] + The names of each of the extra columns for each of the line types. 
The keys of + the outer dict encode each line type and the keys of the inner dict encode each + extra column + types_ex : dict[int, list[type]] + The types of each of the extra columns for each of the line types. The keys of + the outer dict encode each line type and the keys of the inner dict encode each + extra column + meta_lines : list[Line] + The metadata lines in the file + data : dict[int, list[Line]] + A list of the lines, delineated by their line type (as the keys to the dict) + hrids : dict[int, dict[str, Line]] + Each haplotype and repeat line, keyed by its ID. The outer dictionary encodes + line types + vrids : dict[str, dict[str, Line]] + Each variant line, keyed by its ID. The outer dictionary encodes line types + referenced_chromosomes : set[str] + A running list of the chromosomes that have been seen + errc : int + A running count of the errors we've seen + warc : int + A running count of the warnings we've seen + """ + # H CHROM START END ID + MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 + + # R CHROM START END ID LN + MANDATORY_REPEAT_COLUMN_COUNT: int = 5 + + # V CHROM START END ID CHROM LN + MANDATORY_VARIANT_COLUMN_COUNT: int = 6 + + # # version + MANDATORY_VERSION_COLUMNS: int = 3 + + # #X Name Type [Description] + MANDATORY_DEFINITION_COLUMNS = 3 + + KEY_HAPLOTYPE: int = 0 + KEY_REPEAT: int = 1 + KEY_VARIANT: int = 2 + KEY_VARIANT_SRC: int = 9 + + NAME_HAPLOTYPE = "Haplotype" + NAME_REPEAT = "Repeat" + NAME_VARIANT = "Variant" + + KEY_KEY: str = "HT::Key" + KEY_CHROMOSOME: str = "HT::Chromosome" + KEY_START: str = "HT::Start" + KEY_END: str = "HT::End" + KEY_ID: str = "HT::ID" + KEY_ALLELE: str = "HT::Allele" + + def __init__(self, logger: logging.Logger = None): + self.log = logger or logging.getLogger(self.__class__.__name__) + + self.vars_ex: dict[int, dict[str, type]] = { + HapFileValidator.KEY_HAPLOTYPE: {}, + HapFileValidator.KEY_REPEAT: {}, + HapFileValidator.KEY_VARIANT: {}, + } + + self.types_ex: dict[int, list[type]] = { + 
HapFileValidator.KEY_HAPLOTYPE: [], + HapFileValidator.KEY_REPEAT: [], + HapFileValidator.KEY_VARIANT: [], + } + + self.meta_lines: list[Line] = [] + self.data: dict[int, list[Line]] = { + HapFileValidator.KEY_HAPLOTYPE: [], + HapFileValidator.KEY_REPEAT: [], + HapFileValidator.KEY_VARIANT: [], + } + + self.hrids: dict[int, dict[str, Line]] = { + HapFileValidator.KEY_HAPLOTYPE: {}, + HapFileValidator.KEY_REPEAT: {}, + } + + self.vrids: dict[str, dict[str, Line]] = {} + + self.referenced_chromosomes: set[str] = set() + + self.errc: int = 0 + self.warc: int = 0 + + def extract_and_store_content(self, file: HapFileIO, sorted: bool = False): + """ + Extract the header and data lines of a HapFileIO instance + + Parameters + ---------- + file : HapFileIO + The file object to extract and store content from. + sorted : bool, optional + Flag indicating whether the lines are already sorted + """ + lines = file.lines(sorted=sorted) + + self.extract_meta_lines(lines) + self.extract_data_lines(lines) + + def extract_meta_lines(self, lines: list[Line]): + """ + Identify header lines in the file + + Parameters + ---------- + lines : list[Line] + The full set of lines, from which the header lines must be extracted + """ + header_limit = next( + i for i, line in enumerate(lines) if not line[0].startswith("#") + ) + self.meta_lines = lines[:header_limit] + + def extract_data_lines(self, lines: list[Line]): + """ + Identify non-header lines and categorize them based on their field type. + + Parameters + ---------- + lines : list[Line] + The full set of lines from the file + """ + # TODO: do not encode H, R, or V here but somewhere global + ln = [ + [ln for ln in lines if ln[0].startswith("H")], + [ln for ln in lines if ln[0].startswith("R")], + [ln for ln in lines if ln[0].startswith("V")], + [ln for ln in lines if ln[0][0] not in ["H", "R", "V", "#"]], + ] + + for l in ln[3]: + self.lefl("Unrecognized field type. 
Must be one of 'H', 'R' or 'V'.", l) + self.errc += 1 + + for i in range( + HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1 + ): + self.data[i] = ln[i] + + # + # Version Validation + # + + def validate_version_declarations(self): + """ + Confirm that the version declaration is in the correct format + + This method extracts the version declaration and checks it's in the correct + format. If no version declarations are found, we assume the latest version and + issue a warning. + """ + versions = self.extract_version_declarations() + if len(versions) == 0: + self.log.warning( + f"No version declaration found. Assuming to use the latest version." + ) + self.warc += 1 + return + + self.validate_version_format(versions[-1]) + + def extract_version_declarations(self) -> list[Line]: + """ + Extracts version declarations from the meta lines + + Issues warnings for each version declaration after the first, since there + should only ever be one. + + Returns + ------- + list[Line] + A list of version declarations as Line objects + """ + decls = list( + filter(lambda x: x.count > 1 and x[1] == "version", self.meta_lines) + ) + + if len(decls) > 1: + self.log.warning( + f"Found more than one version declaration. Using the last" + f" instance. Each is its own warning." 
+ ) + + for decl in decls: + self.warc += 1 + self.lwfl("", decl, sep="") + + return decls + + def validate_version_format(self, version: Line): + """ + Validates the format of the version declaration + + Parameters + ---------- + version: Line + The line containing the version declaration + """ + if version.count < 3: + self.leexfl( + "Not enough columns in version declaration", + HapFileValidator.MANDATORY_DEFINITION_COLUMNS, + version.count, + version, + ) + self.warnskip(version) + + self.errc += 1 + return + + if search(r"\d+\.\d+\.\d+", version[2]) == None: + self.leexfl( + "Version is incorrectly formatted", + "'x.x.x' where 'x' is an integer", + version[2], + version, + ) + self.errc += 1 + + # + # Column additions + # + + def validate_column_additions(self): + additions = self.find_column_additions() + + for i, k in enumerate(["#H", "#R", "#V"]): + self.add_column_additions_to_header( + i, list(filter(lambda line: line[0] == k, additions)) + ) + + def find_column_additions(self) -> list[Line]: + additions = list( + filter(lambda line: search(r"#[H|R|V]", line[0]) != None, self.meta_lines) + ) + + invalid_lines = [ + x for x in self.meta_lines if x not in additions and len(x[0]) > 1 + ] + + for ln in invalid_lines: + self.leexfl( + "Invalid column addition type.", + "A column addition for 'H', 'R', or 'V'", + f"A column addition for '{ln[0][1]}', whose type doesn't exist", + ln, + ) + self.errc += 1 + + return additions + + def add_column_additions_to_header(self, tp: int, additions: list[Line]): + for addition in additions: + if addition.count < 3: + self.leexfl( + "Insufficient columns for extra column definition", + HapFileValidator.MANDATORY_DEFINITION_COLUMNS, + addition.count, + addition, + ) + self.warnskip(addition) + + self.errc += 1 + return + + ptp = self.retrieve_column_addition_data_type(addition) + + if ptp == object: + self.warnskip(addition) + continue + + self.vars_ex[tp].update({addition[1]: ptp}) + self.types_ex[tp].append(ptp) + + def 
retrieve_column_addition_data_type(self, addition: Line) -> type: + tp = addition[2] + + if tp == "d": + return int + + if search(r"\.\d+f", addition.content) != None: + return float + + if tp == "s": + return str + + self.leexfl( + "Could not parse type for column addition", + "One of: 'd', 's', '.xf' (where 'x' is an integer)", + f"{addition[2]}", + addition, + ) + + self.errc += 1 + return object + + # + # Minimum Requirements + # + + def validate_columns_fulfill_minreqs(self): + self.validate_haplotypes() + self.validate_repeats() + self.validate_variants() + + def validate_haplotypes(self): + for line in self.data[HapFileValidator.KEY_HAPLOTYPE]: + has_min_cols = self.check_has_min_cols( + line, HapFileValidator.MANDATORY_HAPLOTYPE_COLUMN_COUNT + ) + + self.check_start_and_end_positions(line) + + if not has_min_cols: + self.lwexfl( + "Cannot check for variant references: Insufficient columns", + "A mandatory 5 columns for haplotyes", + line.count, + line, + ) + self.warnskip(line) + + self.warc += 1 + return + + variant_refs = self.vrids.get(line[4]) + + if variant_refs == None: + self.leexfl( + f"Haplotype ID '{line[4]}' is not associated to any variants", + f"A variant association for Haplotype ID '{line[4]}'", + "No association", + line, + ) + + self.errc += 1 + return + + def validate_repeats(self): + for line in self.data[HapFileValidator.KEY_REPEAT]: + self.check_has_min_cols( + line, HapFileValidator.MANDATORY_REPEAT_COLUMN_COUNT + ) + + self.check_start_and_end_positions(line) + + def validate_variants(self): + for line in self.data[HapFileValidator.KEY_VARIANT]: + self.check_has_min_cols( + line, HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT + ) + + self.check_start_and_end_positions(line) + self.check_variant_alleles(line) + + def check_has_min_cols(self, line: Line, min: int) -> bool: + if line.count < min: + self.leexfl( + "Invalid amount of mandatory columns in definition.", + f"At least {min}", + line.count, + line, + ) + + self.errc += 1 + 
return False + + return True + + def check_start_and_end_positions(self, line: Line): + if line.count < 4: + self.lwfl( + "Cannot validate start and end positions: Insufficient columns", line + ) + self.warnskip(line) + + self.warc += 1 + return + + f = False + + if not line[2].isdigit(): + self.leexfl( + "Cannot convert start position to integer", + "Integer values for the start position", + line[2], + line, + ) + + self.errc += 1 + f = True + + if not line[3].isdigit(): + self.leexfl( + "Cannot convert end position to integer", + "Integer values for the end position", + line[3], + line, + ) + + self.errc += 1 + f = True + + if f: + self.lwfl( + "Cannot test for correct position order due to previous errors" + " (Inconvertible integers)", + line, + ) + self.warnskip(line) + + self.warc += 1 + return + + start = int(line[2]) + end = int(line[3]) + + if start > end: + self.leexfl( + "Start position is greater than the end position", + f"Start to be positioned at or before the end", + f"{start} > {end} | Difference of {start - end}", + line, + ) + + self.errc += 1 + + if line.count < 5: + self.lwexfl( + "Cannot perform position validations against variant definitions:" + " Insufficient columns.", + 5, + line.count, + line, + ) + self.warnskip(line) + + self.warc += 1 + return + + variant_refs = self.vrids.get(line[4]) + + if variant_refs == None: + return + + for id, ln in variant_refs.items(): + if not ln[2].isdigit(): + self.leexfl( + "Variant start position cannot be converted to an integer.", + "An integer", + ln[2], + ln, + ) + self.warnskip(line) + + self.errc += 1 + return + + if not ln[3].isdigit(): + self.leexfl( + "Variant end position cannot be converted to an integer.", + "An integer", + ln[3], + ln, + ) + self.warnskip(line) + + self.errc += 1 + return + + vstart = int(ln[2]) + vend = int(ln[3]) + + if vstart < start: + self.leexfl( + "Variant start position cannot be prior to the start position of" + " its haplotype.", + "The variant to start after or 
when the haplotype does", + f"[Variant] {vstart} < [Haplotype] {start} | Difference of" + f" {start - vstart}", + line, + ) + self.log.warning(f"At Line #{ln.number}: {ln}") + + self.errc += 1 + + if vend > end: + self.leexfl( + "Variant end position cannot be after than the end position of its" + " haplotype.", + "The variant to end before or when the haplotype does", + f"[Variant] {vend} > [Haplotype] {end} | Difference of" + f" {vend - end}", + line, + ) + self.log.warning(f"At Line #{ln.number}: {ln}") + + self.errc += 1 + + def check_variant_alleles(self, line: Line): + if line.count < HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT: + self.lwexfl( + "Cannot test for variant allele type: Not enough columns.", + HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT, + line.count, + line, + ) + self.warnskip(line) + + self.warc += 1 + return + + if line[5].upper() not in ["A", "C", "G", "T"]: + self.leexfl( + "Invalid allele type in variant.", + "One of 'A', 'C', 'G', 'T'", + f"'{line[5]}'", + line, + ) + + self.errc += 1 + + # + # ID Storage + # + + def store_ids(self): + for tp in range(2): + for line in self.data[tp]: + self.store_hrid(tp, line) + + for line in self.data[HapFileValidator.KEY_VARIANT]: + self.store_variant_id(line) + + def store_hrid(self, tp: int, line: Line): + should_skip = False + if line.count < 2: + self.lwexfl( + "Cannot extract chromosome ID: Insufficient columns.", + "At least 1 column", + line.count, + line, + ) + + self.warc += 1 + self.warnskip(line) + return + + if line.count < 5: + self.lwexfl( + "Cannot extract ID: Insufficient columns.", + f"At least 5 for ID extraction", + line.count, + line, + ) + + self.warnskip(line) + + self.warc += 1 + return + + self.referenced_chromosomes.add(line[1]) + + if line[4] in self.hrids[tp]: + self.leexfl("Duplicate ID.", "A unique ID", f"'{line[4]}'", line) + self.log.warning( + f"Originally defined at: line #{self.hrids[tp][line[4]].number}" + f"\n:: {self.hrids[tp][line[4]].content}" + ) + + 
self.warnskip(line) + + self.errc += 1 + return + + if line[4] in self.referenced_chromosomes: + self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) + self.warnskip(line) + + self.warc += 1 + return + + self.hrids[tp].update({line[4]: line}) + + def store_variant_id(self, line: Line): + if line.count < 5: + self.lwexfl( + "Cannot extract ID: Insufficient columns.", + f"At least 5 for ID extraction", + line.count, + line, + ) + + self.warnskip(line) + + self.warc += 1 + return + + if not line[1] in self.vrids.keys(): + self.vrids.update({line[1]: {}}) + + if line[4] in self.vrids[line[1]].keys(): + self.leexfl( + "Duplicate variant in for a same haplotype ID.", + "A unique ID per haplotype", + f"'{line[4]}'", + line, + ) + self.log.warning( + f"Originally defined at: line #{self.vrids[line[1]][line[4]].number}" + f"\n{self.vrids[line[1]][line[4]].content}" + ) + + self.warnskip(line) + + self.errc += 1 + return + + if line[4] in self.referenced_chromosomes: + self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) + self.warnskip(line) + + self.warc += 1 + return + + self.vrids[line[1]].update({line[4]: line}) + + # + # Variant Validation + # + + def validate_variant_ids(self): + for haplotype, ids in self.vrids.items(): + no_haplotype = False + for id, line in ids.items(): + if haplotype not in self.hrids[HapFileValidator.KEY_HAPLOTYPE].keys(): + self.lefl( + f"Cannot link variant '{id}' to non-exisent haplotype" + f" '{haplotype}'", + line, + ) + no_haplotype = True + + self.errc += 1 + continue + + if no_haplotype: + self.log.warning( + f"Define haplotype '{haplotype}' or fix the variant" + " haplotype reference" + ) + + # + # Extra field validation + # + + def validate_extra_fields(self): + for tp in range( + HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1 + ): + excol_count = len(self.types_ex[tp]) + lines = self.data[tp] + + for line in lines: + rs = 5 if tp != HapFileValidator.KEY_VARIANT else 6 + 
extras = line.count - rs + if extras != excol_count: + self.lwexfl( + "Invalid amount of extra columns in line.", + excol_count, + extras, + line, + ) + + if extras < 0: + self.lefl("There aren't even enough mandatory columns", line) + self.warc += 1 + + self.warnskip(line) + + self.warc += 1 + continue + + for ptp, col in zip(self.types_ex[tp], line.columns[rs:]): + conv = self.determine_if_is_convertible(col, ptp) + + if not conv: + self.leexfl( + "Value in extra column is not convertible to the associated" + " type", + f"A value that can be converted to a(n) {str(ptp)[8:-2]}", + col, + line, + ) + + self.errc += 1 + + def determine_if_is_convertible(self, what: str, tp: type) -> bool: + if tp == int: + return what.isdigit() + + if tp == float: + return search(r"\d*\.?\d+$", what) != None + + return tp == str + + # + # Extra field reordering + # + + def reorder_extra_fields(self): + reordering_metalns = list( + filter( + lambda line: line.count > 1 and search("order[H|R|V]", line[1]) != None, + self.meta_lines, + ) + ) + + for i, c in enumerate(["H", "R", "V"]): + relevant = list(filter(lambda line: line[1][5] == c, reordering_metalns)) + + if len(relevant) == 0: + continue + + if len(relevant) > 1: + self.log.warning( + f"Found multiple order{c} definition lines. Using the last" + " available one." 
+ ) + self.warc += 1 + + ln = relevant[-1] + + self.reorder_field_types(i, ln) + + def reorder_field_types(self, tp: int, line: Line): + extpc = len(self.vars_ex[tp].keys()) + exclc = line.count - 2 + + if extpc > exclc: + self.leexfl( + "Not enough columns in extra column reordering", + extpc + 2, + exclc + 2, + line, + ) + self.warnskip(line) + + self.errc += 1 + return + + s = False + for col in line.columns[2:]: + if not col in self.vars_ex[tp]: + self.lefl(f"{col} has not been defined as an extra colunm", line) + self.errc += 1 + s = True + + if s: + self.warnskip(line) + return + + self.types_ex[tp].clear() + for col in line.columns[2:]: + self.types_ex[tp].append(self.vars_ex[tp][col]) + + def compare_haps_to_pvar( + self, var_ids: list[str], underscores_to_semicolons: bool = False + ): + ids: set[tuple[str, Line]] = set() + for chrom, dt in self.vrids.items(): + for k, l in dt.items(): + ids.add( + (k if not underscores_to_semicolons else k.replace("_", ":"), l) + ) + + for id, l in ids: + if id not in var_ids: + self.lefl(f"Could not find variant id {id} in the .pvar file!", l) + + self.errc += 1 + + # + # Logging + # + + def lefl(self, msg: str, line: Line, sep: str = "\n"): + self.log.error(f"{msg}{sep}At line #{line.number}: {line}") + + def lwfl(self, msg: str, line: Line, sep: str = "\n"): + self.log.warning(f"{msg}{sep}At line #{line.number}: {line}") + + def lwexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): + self.log.warning( + f"{msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + ) + + def leexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): + self.log.error( + f"{msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + ) + + def warnskip(self, line: Line): + self.log.warning(f"Skipping line #{line.number}") + + +def is_hapfile_valid( + filename: Path, + sorted: bool = False, + pvar: Path = None, + log: logging.Logger = None, +) -> bool: + """ + Checks whether a file is 
properly formatted + + Logs suggestions (warnings and errors) if it isn't + + Parameters + ---------- + filename : Path + The path to the file + sorted : bool, optional + Whether the file can be assumed to be sorted already + pvar : Path, optional + Path to a PVAR file with SNPs from the .hap file + log: logging.Logger, optional + A logging module to which to write messages about progress and any errors + + Returns + ------- + bool + True if the file is formatted correctly and False otherwise + """ + if log == None: + log = getLogger(name=LOGGER_NAME, level="CRITICAL") + + file = HapFileIO(filename, logger=log) + + is_readable = file.validate_existence() + + if not is_readable: + return False + + hapfile = HapFileValidator(logger=log) + hapfile.extract_and_store_content(file, sorted=sorted) + + hapfile.store_ids() + + hapfile.validate_column_additions() + + hapfile.validate_columns_fulfill_minreqs() + hapfile.validate_variant_ids() + + hapfile.reorder_extra_fields() + + hapfile.validate_extra_fields() + + hapfile.validate_version_declarations() + + variants = set() + + if pvar is not None: + varfile = GenotypesPLINK(pvar.with_suffix(".pgen")) + varfile.read_variants(variants=variants) + + # TODO: do this quicker by just checking whether the sets intersect + ids = list(map(lambda v: v[0], varfile.variants)) + hapfile.compare_haps_to_pvar(ids) + + log.info( + f"Completed .hap file validation with {hapfile.errc} errors and" + f" {hapfile.warc} warnings." 
+ ) + + return hapfile.errc == 0 and hapfile.warc == 0 diff --git a/tests/data/empty.hap b/tests/data/empty.hap new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/data/empty.hap @@ -0,0 +1 @@ + diff --git a/tests/data/simple.hap b/tests/data/simple.hap index bc327263..55ac2c74 100644 --- a/tests/data/simple.hap +++ b/tests/data/simple.hap @@ -2,7 +2,7 @@ # version 0.2.0 #H ancestry s Local ancestry #H beta .2f Effect size in linear model -H 1 10114 8 H1 YRI 0.75 +H 1 10114 10118 H1 YRI 0.75 H 1 10114 10119 H2 YRI 0.5 H 1 10116 10119 H3 ASW 0.25 V H1 10114 10115 1:10114:T:C T diff --git a/tests/data/validate/10_extras_reordered.hap b/tests/data/validate/10_extras_reordered.hap new file mode 100644 index 00000000..f22c29da --- /dev/null +++ b/tests/data/validate/10_extras_reordered.hap @@ -0,0 +1,21 @@ +# version 0.2.0 +#H extra4 s +#H extra1 s +#H extra2 s +#H extra3 s +#H extra0 d +#H extra5 s +#H extra6 s +#H extra9 d +#H extra8 s +#H extra7 s +# orderH extra0 extra1 extra2 extra3 extra4 extra5 extra6 extra7 extra8 extra9 +H 1 10114 10118 H1 0 b c d e f g h i 9 +H 1 10114 10119 H2 0 b c d e f g h i 9 +H 1 10116 10119 H3 0 b c d e f g h i 9 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/basic.hap b/tests/data/validate/basic.hap new file mode 100644 index 00000000..42ed6568 --- /dev/null +++ b/tests/data/validate/basic.hap @@ -0,0 +1,6 @@ +# version 0.2.0 +H 21 100 110 haplotype_1 +H 21 110 125 haplotype_2 +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A + diff --git a/tests/data/validate/basic.hap.gz b/tests/data/validate/basic.hap.gz new file mode 100644 index 00000000..30f9ed97 Binary files /dev/null and b/tests/data/validate/basic.hap.gz differ diff --git a/tests/data/validate/basic.hap.gz.tbi b/tests/data/validate/basic.hap.gz.tbi new file mode 
100644 index 00000000..0948234b Binary files /dev/null and b/tests/data/validate/basic.hap.gz.tbi differ diff --git a/tests/data/validate/basic.pvar b/tests/data/validate/basic.pvar new file mode 100644 index 00000000..2691d542 --- /dev/null +++ b/tests/data/validate/basic.pvar @@ -0,0 +1,5 @@ +##filedate=20230724 +##contig= +#CHROM POS ID REF ALT +21 100 variant_1 G C +21 110 variant_2 T A diff --git a/tests/data/validate/basic_missing_ids.pvar b/tests/data/validate/basic_missing_ids.pvar new file mode 100644 index 00000000..30c45929 --- /dev/null +++ b/tests/data/validate/basic_missing_ids.pvar @@ -0,0 +1,4 @@ +##filedate=20230724 +##contig= +#CHROM POS ID REF ALT +21 100 variant_1 G C diff --git a/tests/data/validate/correct.hap b/tests/data/validate/correct.hap new file mode 100644 index 00000000..717dce16 --- /dev/null +++ b/tests/data/validate/correct.hap @@ -0,0 +1,19 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + diff --git a/tests/data/validate/duplicate_ids.hap b/tests/data/validate/duplicate_ids.hap new file mode 100644 index 00000000..9647dcf0 --- /dev/null +++ b/tests/data/validate/duplicate_ids.hap @@ -0,0 +1,5 @@ +# version 0.2.0 +H 21 101 110 H1 +H 21 111 120 H1 +H 21 121 
130 H1 +V H1 22 99 V:21:22:99:C:G X diff --git a/tests/data/validate/duplicate_vids_per_haplotype.hap b/tests/data/validate/duplicate_vids_per_haplotype.hap new file mode 100644 index 00000000..e6af8d79 --- /dev/null +++ b/tests/data/validate/duplicate_vids_per_haplotype.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 21 101 110 H1 +V H1 33 33 V:21:22:99:C:G G +V H1 22 22 V:21:22:99:C:G G diff --git a/tests/data/validate/empty_lines.hap b/tests/data/validate/empty_lines.hap new file mode 100644 index 00000000..d19089d2 --- /dev/null +++ b/tests/data/validate/empty_lines.hap @@ -0,0 +1,7 @@ +# version 0.2.0 +H 21 100 110 haplotype_1 +H 21 110 125 haplotype_2 + +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A + diff --git a/tests/data/validate/excol_of_wrong_type.hap b/tests/data/validate/excol_of_wrong_type.hap new file mode 100644 index 00000000..f1cb3082 --- /dev/null +++ b/tests/data/validate/excol_of_wrong_type.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +#H extraField d +H 1 10114 10118 H1 NOT_AN_INTEGER +V H1 10116 10117 1:10116:A:G G \ No newline at end of file diff --git a/tests/data/validate/hrid_of_chromosome.hap b/tests/data/validate/hrid_of_chromosome.hap new file mode 100644 index 00000000..c4cfa33c --- /dev/null +++ b/tests/data/validate/hrid_of_chromosome.hap @@ -0,0 +1,8 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 1 +R 1 10116 10119 1 +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C diff --git a/tests/data/validate/inadequate_version.hap b/tests/data/validate/inadequate_version.hap new file mode 100644 index 00000000..010eadd1 --- /dev/null +++ b/tests/data/validate/inadequate_version.hap @@ -0,0 +1,10 @@ +# version 0.n.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 
1:10117:C:A A diff --git a/tests/data/validate/inadequate_version_columns.hap b/tests/data/validate/inadequate_version_columns.hap new file mode 100644 index 00000000..c712d1e1 --- /dev/null +++ b/tests/data/validate/inadequate_version_columns.hap @@ -0,0 +1,10 @@ +# version +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/inconvertible_ends.hap b/tests/data/validate/inconvertible_ends.hap new file mode 100644 index 00000000..77be1a42 --- /dev/null +++ b/tests/data/validate/inconvertible_ends.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 101x19 H1 +R 1 10116 101x19 R1 +V H1 10114 10115 1:10114:T:C T diff --git a/tests/data/validate/inconvertible_ends_var.hap b/tests/data/validate/inconvertible_ends_var.hap new file mode 100644 index 00000000..9d6544aa --- /dev/null +++ b/tests/data/validate/inconvertible_ends_var.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 10119 H1 +R 1 10116 10119 R1 +V H1 10114 101x15 1:10114:T:C T diff --git a/tests/data/validate/inconvertible_starts.hap b/tests/data/validate/inconvertible_starts.hap new file mode 100644 index 00000000..de0729c1 --- /dev/null +++ b/tests/data/validate/inconvertible_starts.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 101x14 10119 H1 +R 1 101x16 10119 R1 +V H1 10114 10115 1:10114:T:C T diff --git a/tests/data/validate/inconvertible_starts_var.hap b/tests/data/validate/inconvertible_starts_var.hap new file mode 100644 index 00000000..1b61ba06 --- /dev/null +++ b/tests/data/validate/inconvertible_starts_var.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 10119 H1 +R 1 10116 10119 R1 +V H1 101x14 10115 1:10114:T:C T diff --git a/tests/data/validate/insufficient_columns.hap b/tests/data/validate/insufficient_columns.hap new file mode 100644 index 00000000..8a99185b --- /dev/null +++ 
b/tests/data/validate/insufficient_columns.hap @@ -0,0 +1,16 @@ +# version 0.2.0 +H +H 1 +H 1 10114 +H 1 10114 10118 +H 1 10114 10118 H1 +R +R 1 +R 1 10115 +R 1 10115 10119 +R 1 10115 10119 R1 +V +V H1 +V H1 10117 +V H1 10117 10118 +V H1 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/insufficient_excols_in_reorder.hap b/tests/data/validate/insufficient_excols_in_reorder.hap new file mode 100644 index 00000000..5ca68ec4 --- /dev/null +++ b/tests/data/validate/insufficient_excols_in_reorder.hap @@ -0,0 +1,6 @@ +# version 0.2.0 +#H extraField1 d +#H extraField2 d +# orderH extraField1 +H 1 10114 10118 H1 NOT_AN_INTEGER +V H1 10116 10117 1:10116:A:G G \ No newline at end of file diff --git a/tests/data/validate/invalid_column_addition_column_count.hap b/tests/data/validate/invalid_column_addition_column_count.hap new file mode 100644 index 00000000..f28769d5 --- /dev/null +++ b/tests/data/validate/invalid_column_addition_column_count.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H justTheName +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/invalid_column_addition_data_types.hap b/tests/data/validate/invalid_column_addition_data_types.hap new file mode 100644 index 00000000..1562948f --- /dev/null +++ b/tests/data/validate/invalid_column_addition_data_types.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H invalidType x <- Wrong! 
+H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/invalid_column_addition_types.hap b/tests/data/validate/invalid_column_addition_types.hap new file mode 100644 index 00000000..226ec1db --- /dev/null +++ b/tests/data/validate/invalid_column_addition_types.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#X inv s This is invalid! +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/is_directory.hap/.gitkeep b/tests/data/validate/is_directory.hap/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/validate/multiple_order_defs.hap b/tests/data/validate/multiple_order_defs.hap new file mode 100644 index 00000000..1940d75b --- /dev/null +++ b/tests/data/validate/multiple_order_defs.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H a d +#H b d +#H c d +# orderH a b c +# orderH c b a +H 21 100 110 haplotype_1 1 2 3 +H 21 110 125 haplotype_2 3 2 1 +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A + diff --git a/tests/data/validate/multiple_versions.hap b/tests/data/validate/multiple_versions.hap new file mode 100644 index 00000000..2cde8fb3 --- /dev/null +++ b/tests/data/validate/multiple_versions.hap @@ -0,0 +1,12 @@ +# version 0.1.0 +# version 0.2.0 +# version 0.3.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/no_version.hap 
b/tests/data/validate/no_version.hap new file mode 100644 index 00000000..32475610 --- /dev/null +++ b/tests/data/validate/no_version.hap @@ -0,0 +1,9 @@ +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/out_of_header_metas.hap b/tests/data/validate/out_of_header_metas.hap new file mode 100644 index 00000000..38676656 --- /dev/null +++ b/tests/data/validate/out_of_header_metas.hap @@ -0,0 +1,20 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +# This should cause an error if the file is sorted +#V test_field s A field to test with +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A diff --git a/tests/data/validate/simple.hap b/tests/data/validate/simple.hap new file mode 100644 index 00000000..55ac2c74 --- /dev/null +++ b/tests/data/validate/simple.hap @@ -0,0 +1,13 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +H 1 10114 10118 H1 YRI 0.75 +H 1 10114 10119 H2 YRI 0.5 +H 1 10116 10119 H3 ASW 0.25 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 
1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/simple.pvar b/tests/data/validate/simple.pvar new file mode 100644 index 00000000..ef40d395 --- /dev/null +++ b/tests/data/validate/simple.pvar @@ -0,0 +1,7 @@ +##filedate=20180225 +##contig= +#CHROM POS ID REF ALT +1 10114 1:10114:T:C T C +1 10116 1:10116:A:G A G +1 10117 1:10117:C:A C A +1 10122 1:10122:A:G A G diff --git a/tests/data/validate/start_after_end.hap b/tests/data/validate/start_after_end.hap new file mode 100644 index 00000000..8cb2445f --- /dev/null +++ b/tests/data/validate/start_after_end.hap @@ -0,0 +1,10 @@ +# version 0.2.0 +H 1 10118 10114 H1 +H 1 10119 10114 H2 +H 1 10119 10116 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/unassociated_haplotype.hap b/tests/data/validate/unassociated_haplotype.hap new file mode 100644 index 00000000..e9214447 --- /dev/null +++ b/tests/data/validate/unassociated_haplotype.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +H 1 10123 10126 H4 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/unexistent_col_in_order.hap b/tests/data/validate/unexistent_col_in_order.hap new file mode 100644 index 00000000..571ca418 --- /dev/null +++ b/tests/data/validate/unexistent_col_in_order.hap @@ -0,0 +1,19 @@ +# orderH ancestry beta nothing +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 
0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + diff --git a/tests/data/validate/unexistent_fields.hap b/tests/data/validate/unexistent_fields.hap new file mode 100644 index 00000000..1b7c97f8 --- /dev/null +++ b/tests/data/validate/unexistent_fields.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A +X 10 15 144 XID1 diff --git a/tests/data/validate/unexistent_reorders.hap b/tests/data/validate/unexistent_reorders.hap new file mode 100644 index 00000000..5a395aec --- /dev/null +++ b/tests/data/validate/unexistent_reorders.hap @@ -0,0 +1,14 @@ +# version 0.2.0 +#H x d +#H y d +# Should error out! 
Z does not exist +# orderH x y z +H 1 10114 10118 H1 1 2 +H 1 10114 10119 H2 1 2 +H 1 10116 10119 H3 1 2 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/unrecognizable_allele.hap b/tests/data/validate/unrecognizable_allele.hap new file mode 100644 index 00000000..bf319414 --- /dev/null +++ b/tests/data/validate/unrecognizable_allele.hap @@ -0,0 +1,3 @@ +# version 0.2.0 +H 21 100 110 H1 +V H1 22 99 V:21:22:99:C:G X diff --git a/tests/data/validate/variant_id_of_chromosome.hap b/tests/data/validate/variant_id_of_chromosome.hap new file mode 100644 index 00000000..2a203be9 --- /dev/null +++ b/tests/data/validate/variant_id_of_chromosome.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +R 1 10116 10119 R1 +V H1 10114 10115 1 C +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/validate/variant_inexistent_haplotype_id.hap b/tests/data/validate/variant_inexistent_haplotype_id.hap new file mode 100644 index 00000000..dc420011 --- /dev/null +++ b/tests/data/validate/variant_inexistent_haplotype_id.hap @@ -0,0 +1,12 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +R 1 10116 10119 R1 +V H1 10114 10115 1 C +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A +V H4 10117 10118 1:10117:C:T T diff --git a/tests/test_data.py b/tests/test_data.py index 835b3867..13741531 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1152,7 +1152,7 @@ def _basic_unordered_first_field_haps(self): def _get_dummy_haps(self): # create three haplotypes haplotypes = { - "H1": Haplotype(chrom="1", 
"""Tests for ``haptools validate`` and the ``is_hapfile_valid`` helper.

Each negative test points at a fixture in ``tests/data/validate/`` that is
deliberately malformed in exactly one way; the test asserts that validation
rejects it. Positive tests use well-formed fixtures and assert acceptance.
"""
from pathlib import Path

from click.testing import CliRunner

from haptools.__main__ import main
from haptools.validate import is_hapfile_valid

# Shared fixture locations: the general data dir and its validate-specific subdir.
PARENT_DATADIR = Path(__file__).parent.joinpath("data")
DATADIR = PARENT_DATADIR / "validate"


def test_generated_haplotypes():
    """
    Tests the dummy .hap generated by the haptools test suite
    """
    # PARENT_DATADIR / "simple.hap" is already a Path; no extra wrapping needed
    hapfile = PARENT_DATADIR / "simple.hap"
    pvarfile = PARENT_DATADIR / "simple.pvar"

    assert is_hapfile_valid(hapfile, pvar=pvarfile)


def test_indexed():
    """
    Test an indexed .hap file
    """
    assert is_hapfile_valid(DATADIR / "basic.hap.gz")


def test_with_empty_lines():
    """
    Tests a .hap with empty lines
    """
    assert not is_hapfile_valid(DATADIR / "empty_lines.hap")


def test_with_out_of_header_metas_sorted():
    """
    Test a sorted .hap with meta lines out of the header
    """
    assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=True)


def test_with_out_of_header_metas_unsorted():
    """
    Test an unsorted .hap with meta lines out of the header
    """
    assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=False)


def test_with_10_extras_reordered():
    """
    Tests a .hap file with 10 extra columns
    """
    assert is_hapfile_valid(DATADIR / "10_extras_reordered.hap")


def test_with_unexistent_reorders():
    """
    Tests a .hap with an order[H|R|V] which mentions a non-existent extra column
    """
    assert not is_hapfile_valid(DATADIR / "unexistent_reorders.hap")


def test_with_unexistent_fields():
    """
    Tests a .hap with a data line that is not an H, R or V
    """
    assert not is_hapfile_valid(DATADIR / "unexistent_fields.hap")


def test_with_inadequate_version():
    """
    Tests a .hap with an incorrectly formatted version
    """
    assert not is_hapfile_valid(DATADIR / "inadequate_version.hap")


def test_with_no_version():
    """
    Tests a .hap with no present version
    """
    assert not is_hapfile_valid(DATADIR / "no_version.hap")


def test_with_multiple_versions():
    """
    Tests a .hap with several versions present
    """
    assert not is_hapfile_valid(DATADIR / "multiple_versions.hap")


def test_with_inadequate_version_columns():
    """
    Tests a .hap with a version column of only 2 fields
    """
    assert not is_hapfile_valid(DATADIR / "inadequate_version_columns.hap")


def test_with_invalid_column_addition_column_count():
    """
    Tests a .hap with an extra column declaration of invalid column count
    """
    assert not is_hapfile_valid(DATADIR / "invalid_column_addition_column_count.hap")


def test_with_invalid_column_addition_types():
    """
    Tests a .hap with a column addition for a type which is not H, R or V
    """
    assert not is_hapfile_valid(DATADIR / "invalid_column_addition_types.hap")


def test_with_invalid_column_addition_data_types():
    """
    Tests a .hap with a column addition of unrecognized data type (not s, d or .nf)
    """
    assert not is_hapfile_valid(DATADIR / "invalid_column_addition_data_types.hap")


def test_with_insufficient_columns():
    """
    Tests a .hap with insufficient mandatory columns
    """
    assert not is_hapfile_valid(DATADIR / "insufficient_columns.hap")


def test_with_inconvertible_starts():
    """
    Tests a .hap with start positions that can't be converted to integers
    """
    assert not is_hapfile_valid(DATADIR / "inconvertible_starts.hap")


def test_with_inconvertible_ends():
    """
    Tests a .hap with end positions that can't be converted to integers
    """
    assert not is_hapfile_valid(DATADIR / "inconvertible_ends.hap")


def test_with_inconvertible_starts_var():
    """
    Tests a .hap with start positions that can't be converted to integers in variants
    """
    assert not is_hapfile_valid(DATADIR / "inconvertible_starts_var.hap")


def test_with_inconvertible_ends_var():
    """
    Tests a .hap with end positions that can't be converted to integers in variants
    """
    assert not is_hapfile_valid(DATADIR / "inconvertible_ends_var.hap")


def test_start_after_end():
    """
    Tests a .hap with the start position placed after the end position
    """
    assert not is_hapfile_valid(DATADIR / "start_after_end.hap")


def test_is_directory():
    """
    Tests a validation command with a filename that points to a directory
    """
    assert not is_hapfile_valid(DATADIR / "is_directory.hap")


def test_with_variant_id_of_chromosome():
    """
    Tests a .hap with a variant whose ID is the same as a chromosome ID
    """
    assert not is_hapfile_valid(DATADIR / "variant_id_of_chromosome.hap")


def test_with_hrid_of_chromosome():
    """
    Tests a .hap with a haplotype or repeat with the same ID as a chromosome
    """
    assert not is_hapfile_valid(DATADIR / "hrid_of_chromosome.hap")


def test_with_unexistent_col_in_order():
    """
    Tests a .hap with an order[H|R|V] field that references a non-existent
    extra column name
    """
    assert not is_hapfile_valid(DATADIR / "unexistent_col_in_order.hap")


def test_with_unassociated_haplotype():
    """
    Tests a .hap with a haplotype that has no variant line associated with it
    (in the fixture, H4 has no matching V records)
    """
    assert not is_hapfile_valid(DATADIR / "unassociated_haplotype.hap")


def test_with_unrecognizable_allele():
    """
    Tests a .hap with a variant whose allele is not G, C, T or A
    """
    assert not is_hapfile_valid(DATADIR / "unrecognizable_allele.hap")


def test_with_duplicate_ids():
    """
    Tests a .hap with duplicate IDs for H and R fields
    """
    assert not is_hapfile_valid(DATADIR / "duplicate_ids.hap")


def test_with_duplicate_vids_per_haplotype():
    """
    Tests a .hap with duplicate IDs for variants with the same haplotype association
    """
    assert not is_hapfile_valid(DATADIR / "duplicate_vids_per_haplotype.hap")


def test_with_excol_of_wrong_type():
    """
    Tests a .hap with a data line which contains an extra column of d data type
    but receives s
    """
    assert not is_hapfile_valid(DATADIR / "excol_of_wrong_type.hap")


def test_with_multiple_order_defs():
    """
    Tests a .hap with multiple order[H|R|V] of the same type
    """
    assert not is_hapfile_valid(DATADIR / "multiple_order_defs.hap")


def test_with_insufficient_excols_in_reorder():
    """
    Tests a .hap with an order[H|R|V] that does not reference all extra columns
    """
    assert not is_hapfile_valid(DATADIR / "insufficient_excols_in_reorder.hap")


def test_with_variant_inexistent_haplotype_id():
    """
    Tests a .hap with a variant that references a non-existent haplotype
    """
    assert not is_hapfile_valid(DATADIR / "variant_inexistent_haplotype_id.hap")


def test_with_missing_variant_in_pvar():
    """
    Tests a .hap along with a .pvar file which is missing an ID present in the .hap
    """
    assert not is_hapfile_valid(
        DATADIR / "simple.hap",
        pvar=DATADIR / "basic_missing_ids.pvar",
    )


def test_unreadable_hapfile():
    """
    Passes a non-existent file to the validator
    """
    assert not is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap"))


def test_leading_trailing_whitespace():
    """
    We should fail if lines have any leading or trailing whitespace
    """
    basic = DATADIR / "basic.hap"
    with open(basic, "r") as basic_file:
        basic_lines = basic_file.readlines()

    temp_basic = DATADIR / "leading_trailing_whitespace.hap"
    # line indices of the version, H, and V records in basic.hap to perturb
    lines = (0, 2, 3, 5)

    # test both kinds of whitespace: tabs and spaces
    for space_kind in (" ", "\t"):
        # test leading whitespace
        for line in lines:
            with open(temp_basic, "w") as temp_basic_file:
                new_lines = basic_lines.copy()
                new_lines[line] = space_kind + new_lines[line]
                temp_basic_file.writelines(new_lines)
            assert not is_hapfile_valid(temp_basic)
        # test trailing whitespace (replace the newline, then restore it)
        for line in lines:
            with open(temp_basic, "w") as temp_basic_file:
                new_lines = basic_lines.copy()
                new_lines[line] = new_lines[line][:-1] + space_kind + "\n"
                temp_basic_file.writelines(new_lines)
            assert not is_hapfile_valid(temp_basic)

    # also try adding a space next to a tab
    with open(temp_basic, "w") as temp_basic_file:
        new_lines = basic_lines.copy()
        new_lines[1] = new_lines[1][:2] + " " + new_lines[1][2:]
        temp_basic_file.writelines(new_lines)
    assert not is_hapfile_valid(temp_basic)

    # clean up the temporary fixture
    temp_basic.unlink()


def test_basic(capfd):
    """
    The CLI should exit successfully on a well-formed .hap file
    """
    hp_file = DATADIR / "basic.hap"

    cmd = f"validate {hp_file}"
    runner = CliRunner()
    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
    assert result.exit_code == 0


# NOTE(review): the original patch defined test_no_version twice with identical
# bodies; the second definition shadowed the first, so pytest only ever ran one.
# The duplicate has been removed.
def test_no_version(capfd):
    """
    The CLI should exit with a nonzero code when the version line is missing
    """
    hp_file = DATADIR / "no_version.hap"

    cmd = f"validate {hp_file}"
    runner = CliRunner()
    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
    assert result.exit_code != 0


def test_sorted(capfd):
    """
    The CLI should accept an already-sorted file with --sorted
    """
    hp_file = PARENT_DATADIR / "simple.hap"

    cmd = f"validate --sorted {hp_file}"
    runner = CliRunner()
    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
    assert result.exit_code == 0


def test_with_pvar(capfd):
    """
    The CLI should accept a .hap whose variant IDs all appear in --genotypes
    """
    gt_file = PARENT_DATADIR / "simple.pvar"
    hp_file = PARENT_DATADIR / "simple.hap"

    cmd = f"validate --genotypes {gt_file} {hp_file}"
    runner = CliRunner()
    result = runner.invoke(main, cmd.split(" "), catch_exceptions=False)
    assert result.exit_code == 0