From 2b9f7088c8efc1d09a3db37350888252e3343c1a Mon Sep 17 00:00:00 2001 From: Ayimany Date: Fri, 14 Jul 2023 17:52:31 -0600 Subject: [PATCH 01/44] Create base for hapfile validation This base is still missing some features Still not implemented as a cli switch Refer to #47 --- haptools/val_hapfile.py | 463 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 463 insertions(+) create mode 100644 haptools/val_hapfile.py diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py new file mode 100644 index 00000000..156dba51 --- /dev/null +++ b/haptools/val_hapfile.py @@ -0,0 +1,463 @@ + +from pathlib import Path +import re +from sys import stderr, argv +from os import R_OK, access + + +def error_expected(what : str, got : str, linen : int) -> None: + print(f">>> Expected {what} but got {got}", file = stderr) + print(f">>> At line {linen}\n", file = stderr) + + +def error_expecting_cols(cols : list[str]) -> None: + print(f">>>>> Expecting: {cols}\n", file = stderr) + + +def is_line_meta(line : str) -> bool: + return line.startswith("#") + + +def is_line_regular(line : str) -> bool: + return not is_line_meta(line) + + +def is_convertible_to_type(tp : type, what : str) -> bool: + if tp == int: + return what.isdigit() + + elif tp == float: + return what.isdigit() + + elif tp == str: + return True + + return False + + + +def assert_file_exists(path : Path) -> None: + if (path.exists()): + return + + print(f">>> Could not open {path}: The file does not exist", file = stderr) + + +def assert_file_is_regular(path : Path) -> None: + if path.is_file(): + return + + print(f">>> Failed to read {path}: It is not a regular file.") + quit(1) + + +def assert_file_is_readable(path : Path) -> None: + if access(path, R_OK): + return + + print(f">>> Failed to read {path}:Not enough permissions.") + quit(1) + + +def read_file_lines_protected(filename : str) -> list[str]: + content : list[str] + path : Path = Path(filename) + + assert_file_exists(path) + assert_file_is_regular(path) 
+ assert_file_is_readable(path) + + buffer = open(path) + + content = buffer.readlines() + + buffer.close() + + return content + + +class Columns: + + + def __init__(self, content : list[str]): + self.content : list[str] = content + self.count : int = len(content) + + + def assert_length_eq(self, length : int) -> bool: + if self.count == length: + return True + + return False + + + def assert_length_gte(self, length : int) -> bool: + if self.count >= length: + return True + + return False + + + def get(self, index : int) -> str: + if index >= self.count: + return "" + + return self.content[index] + + +class Line: + + + def __init__(self, number : int, content : str): + self.number : int = number + self.content : str = content + self.columns : Columns | None = None + + self.fatal : int = 0 + self.warning : int = 0 + + + def split_and_save(self) -> None: + self.columns = Columns(self.content.split()) + + + def as_columns(self) -> Columns: + if (self.columns == None): + self.split_and_save() + assert self.columns != None + + return self.columns + + + def is_flawed(self) -> bool: + return self.is_wrong() or self.is_ill_formed() + + + def is_wrong(self) -> bool: + return self.fatal > 0 + + + def is_ill_formed(self) -> bool: + return self.warning > 0 + + + def err(self) -> None: + self.fatal += 1 + + + def warn(self) -> None: + self.warning += 1 + + def display_flaws_if_present(self) -> None: + if not self.is_flawed(): + return + + print(f">>>>>>> {self.fatal} error(s) and {self.warning} warning(s) emmited for line #{self.number}", file = stderr) + print(f">>>>>>> {self.content}", file = stderr) + + +class HapFile: + + + KEYWORD_VERSION = r"version" + KEYWORD_VERSION_REGEX = r"\d+\.\d+\.\d+" + KEYWORD_ORDER_REGEX = r"order." 
+ + NONE : int = 0 + HAPLOTYPE : int = 1 + REPEAT : int = 2 + VARIANT : int = 3 + + MANDATORY_COLUMNS_HAPLOTYPE : int = 5 + MANDATORY_COLUMNS_REPEAT : int = 5 + MANDATORY_COLUMNS_VARIANT : int = 6 + MANDATORY_COLUMNS_FIELD_DEF : int = 3 + MANDATORY_COLUMNS_VERSION_DEF : int = 3 + + KEY_LINE_TYPE = "HT_LINE_TYPE" + KEY_CHROMOSOME_ID = "HT_CHROMOSOME_ID" + KEY_START_POSITION = "HT_START_POSITION" + KEY_END_POSITION = "HT_END_POSITION" + KEY_ID = "HT_ID" + KEY_ALLELE = "HT_ALLELE" + + + CHARACTER_TYPE_ASSOCIATIONS : dict[str, int] = { + "H" : HAPLOTYPE, + "R" : REPEAT, + "V" : VARIANT + } + + CHARACTER_PYTYPE_ASSOCIATIONS : dict[str, type] = { + "d" : int, + "f" : float, + "s" : str, + } + + + DEFAULT_HEADER : dict[int, list[tuple[str, type]]] = { + HAPLOTYPE: [ + (KEY_LINE_TYPE, str), + (KEY_CHROMOSOME_ID, str), + (KEY_START_POSITION, int), + (KEY_END_POSITION, int), + (KEY_ID, str) + ], + + REPEAT: [ + (KEY_LINE_TYPE, str), + (KEY_CHROMOSOME_ID, str), + (KEY_START_POSITION, int), + (KEY_END_POSITION, int), + (KEY_ID, str) + ], + + VARIANT: [ + (KEY_LINE_TYPE, str), + (KEY_CHROMOSOME_ID, str), + (KEY_START_POSITION, int), + (KEY_END_POSITION, int), + (KEY_ID, str), + (KEY_ALLELE, str) + ] + } + + DEFAULT_TABLE : dict[int, list[list[str]]] = { + HAPLOTYPE : [], + REPEAT : [], + VARIANT : [] + } + + @staticmethod + def get_associated_hapfile_type_from_str(s : str) -> int | None: + return HapFile.CHARACTER_TYPE_ASSOCIATIONS.get(s.upper()) + + @staticmethod + def get_associated_pytype_from_str(s : str) -> type | None: + return HapFile.CHARACTER_PYTYPE_ASSOCIATIONS.get(s[len(s) - 1]) + + + def __init__(self): + self.header : dict[int, list[tuple[str, type]]] = HapFile.DEFAULT_HEADER + self.tensor : dict[int, list[list[str]]] = HapFile.DEFAULT_TABLE + self.version : str | None = None + + self.fatal_errors : int = 0 + self.warnings : int = 0 + + + # + # Reading + # + + + def read_file(self, filename : str) -> None: + file_content = read_file_lines_protected(filename) + + 
header_lines : list[Line] = [] + data_lines : list[Line] = [] + + linen = 0 + + for line in file_content: + linen += 1 + + if line.isspace(): continue + + if is_line_meta(line): + header_lines.append(Line(linen, line)) + else: + data_lines.append(Line(linen, line)) + + + self.read_into_header(header_lines) + self.read_into_matrix(data_lines) + + + # + # Header + # + + + def read_into_header(self, values : list[Line]) -> None: + for line in values: + self.parse_meta_line(line) + line.display_flaws_if_present() + + + def parse_meta_line(self, line : Line): + columns : Columns = line.as_columns() + + if len(columns.get(0)) < 2: + self.parse_comment_or_meta(line) + + else: + self.parse_column_addition(line) + + # + # C1 Is Just "#" + # + + + def parse_comment_or_meta(self, line : Line) -> None: + columns : Columns = line.as_columns() + + if columns.count < 2: return + metatype_column = columns.get(1) + + if metatype_column == HapFile.KEYWORD_VERSION: + self.parse_version(line) + + elif re.search(HapFile.KEYWORD_ORDER_REGEX, metatype_column): + self.set_column_order(line) + + + # + # Version Key Present + # + + + def parse_version(self, line: Line) -> None: + columns = line.as_columns() + + s = columns.assert_length_eq(HapFile.MANDATORY_COLUMNS_VERSION_DEF) + if not s: + line.err() + error_expected(f"{HapFile.MANDATORY_COLUMNS_VERSION_DEF} columns for version definition", str(columns.count), line.number) + error_expecting_cols(["Hash (#)", "version", ""]) + self.skip_due_to_errors_emmited(line) + return + + version = line.as_columns().get(2) + if (re.search(HapFile.KEYWORD_VERSION_REGEX, version)) == None: + error_expected("a version whose format conforms to \"x.x.x\" where \"x\" is an integer", f"\"{version}\"", line.number) + line.warn() + + self.version = version + + # + # orderX present + # + + + def set_column_order(self, line : Line) -> None: + columns : Columns = line.as_columns() + + order_x = columns.get(1) + hapfile_type_str = order_x[5:] + + tp = 
HapFile.CHARACTER_TYPE_ASSOCIATIONS.get(hapfile_type_str) + + if (tp == None): + error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} as the type for an order definition", f"\"{hapfile_type_str}\"", line.number) + + print("CALL set_column_order ") + + + # + # C1 is #[...] + # + + + def parse_column_addition(self, line: Line) -> None: + columns : Columns = line.as_columns() + + success = columns.assert_length_gte(HapFile.MANDATORY_COLUMNS_FIELD_DEF) + if not success: + line.err() + error_expected(f"{HapFile.MANDATORY_COLUMNS_FIELD_DEF} columns for extra field definition", str(columns.count), line.number) + error_expecting_cols(["Hash & Type (#X)", "Name", "Data Type Format (s, d, .xf)", "Optional Description"]) + self.skip_due_to_errors_emmited(line) + return + + hapfile_type_str = columns.get(0)[1:] + hapfile_type = HapFile.get_associated_hapfile_type_from_str(hapfile_type_str) + + if (hapfile_type == None): + line.warn() + error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} for the line type when adding an extra field", f"\"{hapfile_type_str}\"", line.number) + + python_type = HapFile.get_associated_pytype_from_str(columns.get(2)) + + if python_type == None: + line.warn() + error_expected(f"one of {list(HapFile.CHARACTER_PYTYPE_ASSOCIATIONS.keys())} for the data type when adding an extra field", f"\"{columns.get(2)}\"", line.number) + + if (line.is_flawed()): + self.skip_due_to_errors_emmited(line) + return + + assert hapfile_type != None + assert python_type != None + + self.header[hapfile_type].append((columns.get(1), python_type)) + + + # + # Matrix + # + + + def read_into_matrix(self, values : list[Line]) -> None: + for line in values: + self.parse_data_line(line) + line.display_flaws_if_present() + + + def parse_data_line(self, line : Line) -> None: + columns = line.as_columns() + + hapfile_type_str : str | None = columns.get(0) + hapfile_type = self.get_associated_hapfile_type_from_str(hapfile_type_str) + + if 
hapfile_type == None: + line.err() + error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} when defining data", hapfile_type_str, line.number) + return + + self.store_line_into_matrix(hapfile_type, line) + self.validate_line_in_matrix(hapfile_type, line) + + + def store_line_into_matrix(self, hftp : int, line : Line): + columns : Columns = line.as_columns() + type_header : list[tuple[str, type]] = self.header[hftp] + + if (columns.count != len(type_header)): + line.warn() + + hftpci : int = list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.values()).index(hftp) + hftpc : str = list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())[hftpci] + + error_expected(f"{len(type_header)} columns for \"{hftpc}\" entry", str(columns.count), line.number) + + matrix = self.tensor[hftp] + matrix.append(columns.content) + + + def validate_line_in_matrix(self, hftp : int, line : Line): + columns : Columns = line.as_columns() + type_header : list[tuple[str, type]] = self.header[hftp] + + if line.is_flawed(): + self.skip_due_to_errors_emmited(line) + return + + for i, col in enumerate(type_header): + entry = columns.content[i] + + if not is_convertible_to_type(col[1], entry): + error_expected(f"a(n) {str(col[1])[7:-1]} for column \"{col[0]}\"", entry, line.number) + line.warn() + + + + # + # Extra + # + + + def skip_due_to_errors_emmited(self, line : Line) -> None: + print(f">>>>> Skipping line #{line.number}.", file = stderr) + + From 6bd77fc96f217f63884c3ca5b2b8e3ef60a4b848 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 20 Jul 2023 11:00:11 -0600 Subject: [PATCH 02/44] Solidify and improve validator base Supports many more features Simpler codebase Straight to the point Logs many more errors Still in development: missing some features --- haptools/val_hapfile.py | 738 ++++++++++++++++++++++++---------------- 1 file changed, 442 insertions(+), 296 deletions(-) diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 156dba51..2308934b 100644 --- 
a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -1,463 +1,609 @@ +import os +from logging import Logger, getLogger +from re import search -from pathlib import Path -import re -from sys import stderr, argv -from os import R_OK, access +VALHAP_LOGGER_NAME = "Hapfile Validation" +LTS_SPEC = "0.2.0" +TRAIL = "\n>>>" -def error_expected(what : str, got : str, linen : int) -> None: - print(f">>> Expected {what} but got {got}", file = stderr) - print(f">>> At line {linen}\n", file = stderr) +def tmpex(expectation : object, received : object) -> str: + return f"Expected: {expectation}\nReceived: {received}" -def error_expecting_cols(cols : list[str]) -> None: - print(f">>>>> Expecting: {cols}\n", file = stderr) - +class Line: -def is_line_meta(line : str) -> bool: - return line.startswith("#") + def __init__(self, content : str, number : int): + self.content : str = content + self.number : int = number -def is_line_regular(line : str) -> bool: - return not is_line_meta(line) + self.columns : list[str] = content.split() + self.count : int = len(self.columns) -def is_convertible_to_type(tp : type, what : str) -> bool: - if tp == int: - return what.isdigit() + def is_empty(self) -> bool: + return self.count == 0 - elif tp == float: - return what.isdigit() - elif tp == str: - return True + def __getitem__(self, index : int) -> str: + return self.columns[index] - return False + def __str__(self) -> str: + return self.content -def assert_file_exists(path : Path) -> None: - if (path.exists()): - return +class HapFileIO: - print(f">>> Could not open {path}: The file does not exist", file = stderr) + def __init__(self, filename : str, logger : Logger = getLogger(VALHAP_LOGGER_NAME)): + self.filename = filename + self.logger = logger -def assert_file_is_regular(path : Path) -> None: - if path.is_file(): - return - print(f">>> Failed to read {path}: It is not a regular file.") - quit(1) + def lines(self, sorted : bool = True) -> list[Line]: + buffer = open(self.filename) + 
content = [Line(line.strip(), i + 1) for i, line in enumerate(buffer.readlines())] + content = list(filter(lambda line : not line.is_empty(), content)) -def assert_file_is_readable(path : Path) -> None: - if access(path, R_OK): - return + buffer.close() - print(f">>> Failed to read {path}:Not enough permissions.") - quit(1) + if not sorted: + meta_limit = next(idx for idx, line in enumerate(content) if not line[0].startswith('#')) + content = [line for idx, line in enumerate(content) if (not line[0].startswith('#')) or idx < meta_limit] + # lol + content.sort(key = lambda line : ord(line[0][0])) -def read_file_lines_protected(filename : str) -> list[str]: - content : list[str] - path : Path = Path(filename) + return content - assert_file_exists(path) - assert_file_is_regular(path) - assert_file_is_readable(path) - buffer = open(path) + def validate_existence(self) -> bool: + if not self.exists(): + self.logger.error(f"The file {self.filename} does not exist.") + return False - content = buffer.readlines() + is_ok = True - buffer.close() + if not self.is_regular(): + self.logger.error(f"Cannot read {self.filename}: Is not a regular file.") + is_ok = False - return content + if not self.is_readable(): + self.logger.error(f"Cannot read {self.filename}: Insufficient permissions.") + is_ok = False + return is_ok -class Columns: + def exists(self) -> bool: + return os.path.exists(self.filename) - def __init__(self, content : list[str]): - self.content : list[str] = content - self.count : int = len(content) + def is_regular(self): + return os.path.isfile(self.filename) - def assert_length_eq(self, length : int) -> bool: - if self.count == length: - return True - - return False + def is_readable(self) -> bool: + return os.access(self.filename, os.R_OK) - def assert_length_gte(self, length : int) -> bool: - if self.count >= length: - return True - return False +class HapFile: - def get(self, index : int) -> str: - if index >= self.count: - return "" + # H CHROM START END ID 
+ MANDATORY_HAPLOTYPE_COLUMN_COUNT : int = 5 - return self.content[index] + # R CHROM START END ID LN + MANDATORY_REPEAT_COLUMN_COUNT : int = 5 + # V CHROM START END ID CHROM LN + MANDATORY_VARIANT_COLUMN_COUNT : int = 6 -class Line: + # # version + MANDATORY_VERSION_COLUMNS : int = 3 + # #X Name Type [Description] + MANDATORY_DEFINITION_COLUMNS = 3 - def __init__(self, number : int, content : str): - self.number : int = number - self.content : str = content - self.columns : Columns | None = None + KEY_HAPLOTYPE : int = 0 + KEY_REPEAT : int = 1 + KEY_VARIANT : int = 2 + KEY_VARIANT_SRC : int = 9 - self.fatal : int = 0 - self.warning : int = 0 + NAME_HAPLOTYPE = "Haplotype" + NAME_REPEAT = "Repeat" + NAME_VARIANT = "Variant" + KEY_KEY : str = "HT::Key" + KEY_CHROMOSOME : str = "HT::Chromosome" + KEY_START : str = "HT::Start" + KEY_END : str = "HT::End" + KEY_ID : str = "HT::ID" + KEY_ALLELE : str = "HT::Allele" - def split_and_save(self) -> None: - self.columns = Columns(self.content.split()) + DEFAULT_HEADER : dict[int, dict[str, type]] = { + KEY_HAPLOTYPE : {}, + KEY_REPEAT : {}, + KEY_VARIANT : {} + } + EMPTY_TYPES : dict[int, list[type]] = { + KEY_HAPLOTYPE : [], + KEY_REPEAT : [], + KEY_VARIANT : [] + } - def as_columns(self) -> Columns: - if (self.columns == None): - self.split_and_save() - assert self.columns != None + EMPTY_DATA : dict[int, list[Line]] = { + KEY_HAPLOTYPE : [], + KEY_REPEAT : [], + KEY_VARIANT : [] + } - return self.columns + EMPTY_HRIDS : dict[int, dict[str, Line]] = { + KEY_HAPLOTYPE : {}, + KEY_REPEAT : {}, + } + EMPTY_VRIDS : dict[str, dict[str, Line]] = { + } - def is_flawed(self) -> bool: - return self.is_wrong() or self.is_ill_formed() + EMPTY_META : list[Line] = [] - def is_wrong(self) -> bool: - return self.fatal > 0 + def __init__(self, logger : Logger = getLogger(VALHAP_LOGGER_NAME)): + self.logger : Logger = logger + self.vars_ex : dict[int, dict[str, type]] = HapFile.DEFAULT_HEADER + self.types_ex : dict[int, list[type]] = 
HapFile.EMPTY_TYPES - def is_ill_formed(self) -> bool: - return self.warning > 0 + self.meta : list[Line] = HapFile.EMPTY_META + self.data : dict[int, list[Line]] = HapFile.EMPTY_DATA + self.hrids : dict[int, dict[str, Line]] = HapFile.EMPTY_HRIDS + self.vrids : dict[str, dict[str, Line]] = HapFile.EMPTY_VRIDS - def err(self) -> None: - self.fatal += 1 + self.referenced_chromosomes : set[str] = set() - - def warn(self) -> None: - self.warning += 1 - def display_flaws_if_present(self) -> None: - if not self.is_flawed(): - return + def extract_and_store_content(self, file : HapFileIO, sorted : bool = True): + lines = file.lines(sorted = sorted) - print(f">>>>>>> {self.fatal} error(s) and {self.warning} warning(s) emmited for line #{self.number}", file = stderr) - print(f">>>>>>> {self.content}", file = stderr) + self.extract_meta_lines(lines) + self.extract_data_lines(lines) -class HapFile: + def extract_meta_lines(self, lines : list[Line]): + header_limit = next(i for i, line in enumerate(lines) if not line[0].startswith('#')) + self.meta_lines = lines[:header_limit] - KEYWORD_VERSION = r"version" - KEYWORD_VERSION_REGEX = r"\d+\.\d+\.\d+" - KEYWORD_ORDER_REGEX = r"order." 
+ def extract_data_lines(self, lines : list[Line]): + limits = [0, 0, 0] + for i, char in enumerate(['H', 'R', 'V']): + limits[i] = next(i for i, line in enumerate(lines) if line[0].startswith(char)) - NONE : int = 0 - HAPLOTYPE : int = 1 - REPEAT : int = 2 - VARIANT : int = 3 + ln = [lines[limits[0] : limits[1]], lines[limits[1] : limits[2]], lines[limits[2] : ]] - MANDATORY_COLUMNS_HAPLOTYPE : int = 5 - MANDATORY_COLUMNS_REPEAT : int = 5 - MANDATORY_COLUMNS_VARIANT : int = 6 - MANDATORY_COLUMNS_FIELD_DEF : int = 3 - MANDATORY_COLUMNS_VERSION_DEF : int = 3 + for i in range(HapFile.KEY_HAPLOTYPE, HapFile.KEY_VARIANT + 1): + self.data[i] = ln[i] - KEY_LINE_TYPE = "HT_LINE_TYPE" - KEY_CHROMOSOME_ID = "HT_CHROMOSOME_ID" - KEY_START_POSITION = "HT_START_POSITION" - KEY_END_POSITION = "HT_END_POSITION" - KEY_ID = "HT_ID" - KEY_ALLELE = "HT_ALLELE" + # + # Version Validation + # + + def validate_version_declarations(self): + versions = self.extract_version_declarations() + if len(versions) == 0: + self.logger.warn(f"{TRAIL} No version declaration found. 
Assuming to use the latest version.") - CHARACTER_TYPE_ASSOCIATIONS : dict[str, int] = { - "H" : HAPLOTYPE, - "R" : REPEAT, - "V" : VARIANT - } + for version in versions: + self.validate_version_format(version) - CHARACTER_PYTYPE_ASSOCIATIONS : dict[str, type] = { - "d" : int, - "f" : float, - "s" : str, - } + def extract_version_declarations(self) -> list[Line]: + decls = list(filter(lambda x : x.count > 1 and x[1] == "version", self.meta_lines)) - DEFAULT_HEADER : dict[int, list[tuple[str, type]]] = { - HAPLOTYPE: [ - (KEY_LINE_TYPE, str), - (KEY_CHROMOSOME_ID, str), - (KEY_START_POSITION, int), - (KEY_END_POSITION, int), - (KEY_ID, str) - ], - - REPEAT: [ - (KEY_LINE_TYPE, str), - (KEY_CHROMOSOME_ID, str), - (KEY_START_POSITION, int), - (KEY_END_POSITION, int), - (KEY_ID, str) - ], - - VARIANT: [ - (KEY_LINE_TYPE, str), - (KEY_CHROMOSOME_ID, str), - (KEY_START_POSITION, int), - (KEY_END_POSITION, int), - (KEY_ID, str), - (KEY_ALLELE, str) - ] - } + if len(decls) > 1: + self.logger.warn(f"{TRAIL} Found more than one " + "version declaration.") - DEFAULT_TABLE : dict[int, list[list[str]]] = { - HAPLOTYPE : [], - REPEAT : [], - VARIANT : [] - } + for decl in decls: + self.lwfl("", decl, sep = "") - @staticmethod - def get_associated_hapfile_type_from_str(s : str) -> int | None: - return HapFile.CHARACTER_TYPE_ASSOCIATIONS.get(s.upper()) + return decls - @staticmethod - def get_associated_pytype_from_str(s : str) -> type | None: - return HapFile.CHARACTER_PYTYPE_ASSOCIATIONS.get(s[len(s) - 1]) + def validate_version_format(self, version : Line): + if version.count < 3: + self.leexfl("Not enough columns in version declaration", + HapFile.MANDATORY_DEFINITION_COLUMNS, + version.count, + version) + self.logger.warning(f"Skipping line #{version.number}") - def __init__(self): - self.header : dict[int, list[tuple[str, type]]] = HapFile.DEFAULT_HEADER - self.tensor : dict[int, list[list[str]]] = HapFile.DEFAULT_TABLE - self.version : str | None = None + return - 
self.fatal_errors : int = 0 - self.warnings : int = 0 + if search(r"\d+\.\d+\.\d+", version[2]) == None: + self.lwexfl("Version is incorrectly formatted", + "'x.x.x' where 'x' is an integer", + version[2], + version) # - # Reading + # Column additions # - def read_file(self, filename : str) -> None: - file_content = read_file_lines_protected(filename) + def validate_column_additions(self): + additions = self.find_column_additions() - header_lines : list[Line] = [] - data_lines : list[Line] = [] + for i, k in enumerate(["#H", "#R", "#V"]): + self.add_column_additions_to_header(i, list(filter( + lambda line: line[0] == k, + additions))) - linen = 0 - for line in file_content: - linen += 1 + def find_column_additions(self) -> list[Line]: + additions = list(filter( + lambda line : search(r"#[H|R|V]", line[0]) != None, + self.meta_lines)) - if line.isspace(): continue + return additions - if is_line_meta(line): - header_lines.append(Line(linen, line)) - else: - data_lines.append(Line(linen, line)) + def add_column_additions_to_header(self, tp : int, additions : list[Line]): + for addition in additions: + if addition.count < 3: + self.lwexfl("Insufficient columns for extra column definition", + HapFile.MANDATORY_DEFINITION_COLUMNS, + addition.count, + addition) + self.warnskip(addition) + return - self.read_into_header(header_lines) - self.read_into_matrix(data_lines) + ptp = self.retrieve_column_addition_data_type(addition) + if ptp == object: + self.warnskip(addition) + continue - # - # Header - # + self.vars_ex[tp].update({addition[1] : ptp}) + self.types_ex[tp].append(ptp) - def read_into_header(self, values : list[Line]) -> None: - for line in values: - self.parse_meta_line(line) - line.display_flaws_if_present() + def retrieve_column_addition_data_type(self, addition : Line) -> type: + tp = addition[2] + if tp == "d": + return int - def parse_meta_line(self, line : Line): - columns : Columns = line.as_columns() + if search(r"\.\d+f", addition.content) != None: + 
return float - if len(columns.get(0)) < 2: - self.parse_comment_or_meta(line) + if tp == "s": + return str - else: - self.parse_column_addition(line) + self.lwexfl("Could not parse type for column addition", + "One of: 'd', 's', '.xf' (where 'x' is an integer)", + f"{addition[2]}", + addition) + + return object + # - # C1 Is Just "#" + # Minimum Requirements # - def parse_comment_or_meta(self, line : Line) -> None: - columns : Columns = line.as_columns() + def validate_columns_fulfill_minreqs(self): + self.validate_haplotypes() + self.validate_repeats() + self.validate_variants() - if columns.count < 2: return - metatype_column = columns.get(1) - if metatype_column == HapFile.KEYWORD_VERSION: - self.parse_version(line) + def validate_haplotypes(self): + for line in self.data[HapFile.KEY_HAPLOTYPE]: + has_min_cols = self.check_has_min_cols(line, + HapFile.MANDATORY_HAPLOTYPE_COLUMN_COUNT) - elif re.search(HapFile.KEYWORD_ORDER_REGEX, metatype_column): - self.set_column_order(line) + self.check_start_and_end_positions(line) + if not has_min_cols: + self.lwexfl("Cannot check for variant references: Insufficient columns", + "A mandatory 5 columns for haplotyes", + line.count, + line) + self.warnskip(line) - # - # Version Key Present - # + return + variant_refs = self.vrids.get(line[4]) - def parse_version(self, line: Line) -> None: - columns = line.as_columns() + if variant_refs == None: + self.lwexfl(f"Haplotype ID '{line[4]}' is not associated to any variants", + f"A variant association for Haplotype ID '{line[4]}'", + "No association", + line) + return - s = columns.assert_length_eq(HapFile.MANDATORY_COLUMNS_VERSION_DEF) - if not s: - line.err() - error_expected(f"{HapFile.MANDATORY_COLUMNS_VERSION_DEF} columns for version definition", str(columns.count), line.number) - error_expecting_cols(["Hash (#)", "version", ""]) - self.skip_due_to_errors_emmited(line) - return - version = line.as_columns().get(2) - if (re.search(HapFile.KEYWORD_VERSION_REGEX, version)) == 
None: - error_expected("a version whose format conforms to \"x.x.x\" where \"x\" is an integer", f"\"{version}\"", line.number) - line.warn() + def validate_repeats(self): + for line in self.data[HapFile.KEY_REPEAT]: + self.check_has_min_cols(line, + HapFile.MANDATORY_REPEAT_COLUMN_COUNT) - self.version = version + self.check_start_and_end_positions(line) - # - # orderX present - # + def validate_variants(self): + for line in self.data[HapFile.KEY_VARIANT]: + self.check_has_min_cols(line, + HapFile.MANDATORY_VARIANT_COLUMN_COUNT) + + self.check_start_and_end_positions(line) + self.check_variant_alleles(line) + + + def check_has_min_cols(self, line : Line, min : int) -> bool: + if line.count < min: + self.lwexfl("Invalid amount of mandatory columns in definition.", + f"At least {min}", + line.count, + line) + return False - def set_column_order(self, line : Line) -> None: - columns : Columns = line.as_columns() + return True - order_x = columns.get(1) - hapfile_type_str = order_x[5:] - tp = HapFile.CHARACTER_TYPE_ASSOCIATIONS.get(hapfile_type_str) + def check_start_and_end_positions(self, line : Line): + if line.count < 3: + self.lefl("Cannot validate start and end positions: Insufficient columns", + line) + self.warnskip(line) + return - if (tp == None): - error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} as the type for an order definition", f"\"{hapfile_type_str}\"", line.number) + f = False + + if not line[2].isdigit(): + self.leexfl("Cannot convert start position to integer", + "Integer values for the start position", + line[2], + line) + f = True + + if not line[3].isdigit(): + self.leexfl("Cannot convert end position to integer", + "Integer values for the end position", + line[3], + line) + f = True + + if f: + self.lwfl("Cannot test for correct position order due to previous errors (Inconvertible integers)", + line) + self.warnskip(line) + return - print("CALL set_column_order ") + start = int(line[2]) + end = int(line[3]) + + if 
start > end: + self.lwexfl("Start position is greater than the end position", + f"Start to be positioned at or before the end", + f"{start} > {end} | Difference of {start - end}", + line) + + if line.count < 5: + self.lwexfl("Cannot perform position validations against variant definitions: Insufficient columns.", + 5, + line.count, + line) + self.warnskip(line) + return + + variant_refs = self.vrids.get(line[4]) + + if variant_refs == None: + return + + for id, ln in variant_refs.items(): + if not ln[2].isdigit(): + self.lwexfl("Variant start position cannot be converted to an integer.", + "An integer", + ln[2], + ln) + self.warnskip(line) + return + + if not ln[3].isdigit(): + self.lwexfl("Variant end position cannot be converted to an integer.", + "An integer", + ln[3], + ln) + self.warnskip(line) + return + + vstart = int(ln[2]) + vend = int(ln[3]) + + if vstart < start: + self.lwexfl("Variant start position cannot be prior to the start position of its haplotype.", + "The variant to start after or when the haplotype does", + f"[Variant] {vstart} < [Haplotype] {start} | Difference of {start - vstart}", + line) + self.logger.warn(f"At Line #{ln.number}: {ln}") + + if vend > end: + self.lwexfl("Variant end position cannot be after than the end position of its haplotype.", + "The variant to end before or when the haplotype does", + f"[Variant] {vend} > [Haplotype] {end} | Difference of {vend - end}", + line) + self.logger.warn(f"At Line #{ln.number}: {ln}") + + + def check_variant_alleles(self, line : Line): + if line.count < HapFile.MANDATORY_VARIANT_COLUMN_COUNT: + self.lwexfl("Cannot test for variant allele type: Not enough columns.", + HapFile.MANDATORY_VARIANT_COLUMN_COUNT, + line.count, + line) + self.warnskip(line) + return + + if line[5].upper() not in ["A", "C", "G", "T"]: + self.lwexfl("Invalid allele type in variant.", + "One of 'A', 'C', 'G', 'T'", + f"'{line[5]}'", + line) # - # C1 is #[...] 
+ # ID Storage # - def parse_column_addition(self, line: Line) -> None: - columns : Columns = line.as_columns() + def store_ids(self): + for tp in range(2): + for line in self.data[tp]: + self.store_hrid(tp, line) - success = columns.assert_length_gte(HapFile.MANDATORY_COLUMNS_FIELD_DEF) - if not success: - line.err() - error_expected(f"{HapFile.MANDATORY_COLUMNS_FIELD_DEF} columns for extra field definition", str(columns.count), line.number) - error_expecting_cols(["Hash & Type (#X)", "Name", "Data Type Format (s, d, .xf)", "Optional Description"]) - self.skip_due_to_errors_emmited(line) - return + for line in self.data[HapFile.KEY_VARIANT]: + self.store_variant_id(line) - hapfile_type_str = columns.get(0)[1:] - hapfile_type = HapFile.get_associated_hapfile_type_from_str(hapfile_type_str) - if (hapfile_type == None): - line.warn() - error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} for the line type when adding an extra field", f"\"{hapfile_type_str}\"", line.number) + def store_hrid(self, tp : int, line : Line): + should_skip = False + if line.count < 2: + self.lwexfl("Cannot extract chromosome ID: Insufficient columns.", + "At least 1 column", + line.count, + line) + should_skip = True - python_type = HapFile.get_associated_pytype_from_str(columns.get(2)) + if line.count < 5: + self.lwexfl("Cannot extract ID: Insufficient columns.", + f"At least 5 for ID extraction", + line.count, + line) + should_skip = True + + if should_skip: + self.warnskip(line) + return - if python_type == None: - line.warn() - error_expected(f"one of {list(HapFile.CHARACTER_PYTYPE_ASSOCIATIONS.keys())} for the data type when adding an extra field", f"\"{columns.get(2)}\"", line.number) + if line[4] in self.hrids[tp]: + self.lwexfl("Duplicate ID.", + "A unique ID", + f"'{line[4]}'", + line) + self.logger.warn(f"Originally defined at: line #{self.hrids[tp][line[4]].number}") - if (line.is_flawed()): - self.skip_due_to_errors_emmited(line) + self.warnskip(line) 
return - assert hapfile_type != None - assert python_type != None + self.hrids[tp].update({line[4] : line}) + + + def store_variant_id(self, line : Line): + if line.count < 5: + self.lwexfl("Cannot extract ID: Insufficient columns.", + f"At least 5 for ID extraction", + line.count, + line) + + if not line[1] in self.vrids.keys(): + self.vrids.update({line[1] : {}}) + + if line[4] in self.vrids[line[1]].keys(): + self.lwexfl("Duplicate variant in for a same haplotype ID.", + "A unique ID per haplotype", + f"'{line[4]}'", + line) + self.logger.warn(f"Originally defined at: line #{self.vrids[line[1]][line[4]].number}") - self.header[hapfile_type].append((columns.get(1), python_type)) + self.warnskip(line) + return + + self.vrids[line[1]].update({line[4] : line}) # - # Matrix + # Variant Validation # - - def read_into_matrix(self, values : list[Line]) -> None: - for line in values: - self.parse_data_line(line) - line.display_flaws_if_present() + def validate_variants_against_haplotypes(self): + self.validate_variant_ids() - def parse_data_line(self, line : Line) -> None: - columns = line.as_columns() - hapfile_type_str : str | None = columns.get(0) - hapfile_type = self.get_associated_hapfile_type_from_str(hapfile_type_str) + def validate_variant_ids(self): + for haplotype, ids in self.vrids.items(): + no_haplotype = False + for id, line in ids.items(): + if haplotype not in self.hrids[HapFile.KEY_HAPLOTYPE].keys(): + self.lefl(f"Cannot link variant '{id}' to non-exisent haplotype '{haplotype}'", line) + no_haplotype = True + continue - if hapfile_type == None: - line.err() - error_expected(f"one of {list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())} when defining data", hapfile_type_str, line.number) - return + if no_haplotype: + self.logger.warn(f"{TRAIL} Define haplotype '{haplotype}' or fix the variant haplotype reference") - self.store_line_into_matrix(hapfile_type, line) - self.validate_line_in_matrix(hapfile_type, line) - def store_line_into_matrix(self, hftp : 
int, line : Line): - columns : Columns = line.as_columns() - type_header : list[tuple[str, type]] = self.header[hftp] - if (columns.count != len(type_header)): - line.warn() - hftpci : int = list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.values()).index(hftp) - hftpc : str = list(HapFile.CHARACTER_TYPE_ASSOCIATIONS.keys())[hftpci] + # + # Logging + # - error_expected(f"{len(type_header)} columns for \"{hftpc}\" entry", str(columns.count), line.number) + def lefl(self, msg : str, line : Line, sep : str = "\n"): + self.logger.error(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") - matrix = self.tensor[hftp] - matrix.append(columns.content) + def lwfl(self, msg : str, line : Line, sep : str = "\n"): + self.logger.warn(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") - def validate_line_in_matrix(self, hftp : int, line : Line): - columns : Columns = line.as_columns() - type_header : list[tuple[str, type]] = self.header[hftp] - if line.is_flawed(): - self.skip_due_to_errors_emmited(line) - return + def lwexfl(self, msg : str, exp : object, rec : object, line : Line, sep : str = "\n"): + self.logger.warning( + f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}") - for i, col in enumerate(type_header): - entry = columns.content[i] + def leexfl(self, msg : str, exp : object, rec : object, line : Line, sep : str = "\n"): + self.logger.error( + f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}") - if not is_convertible_to_type(col[1], entry): - error_expected(f"a(n) {str(col[1])[7:-1]} for column \"{col[0]}\"", entry, line.number) - line.warn() + def warnskip(self, line : Line): + self.logger.warning(f"Skipping line #{line.number}") - # - # Extra - # +def is_hapfile_valid(filename : str, sorted = True) -> bool: + file = HapFileIO(filename) + is_readable = file.validate_existence() + if not is_readable: + return False + + hapfile = HapFile() + hapfile.extract_and_store_content(file, sorted = sorted) + + hapfile.store_ids() + + 
hapfile.validate_column_additions() - def skip_due_to_errors_emmited(self, line : Line) -> None: - print(f">>>>> Skipping line #{line.number}.", file = stderr) + hapfile.validate_columns_fulfill_minreqs() + hapfile.validate_variant_ids() + + hapfile.validate_version_declarations() + + + + return True From 79698fc8496ef2909bad470ea9f2de9c6a044057 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 20 Jul 2023 11:34:13 -0600 Subject: [PATCH 03/44] Raise error on type IDs which match chromosome IDs --- haptools/val_hapfile.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 2308934b..72b38a89 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -11,6 +11,7 @@ def tmpex(expectation : object, received : object) -> str: return f"Expected: {expectation}\nReceived: {received}" + class Line: @@ -497,6 +498,8 @@ def store_hrid(self, tp : int, line : Line): self.warnskip(line) return + self.referenced_chromosomes.add(line[1]) + if line[4] in self.hrids[tp]: self.lwexfl("Duplicate ID.", "A unique ID", @@ -507,6 +510,13 @@ def store_hrid(self, tp : int, line : Line): self.warnskip(line) return + if line[4] in self.referenced_chromosomes: + self.lefl(f"ID '{line[4]}' is already registered as a chromosome.", + line) + self.warnskip(line) + return + + self.hrids[tp].update({line[4] : line}) @@ -530,6 +540,12 @@ def store_variant_id(self, line : Line): self.warnskip(line) return + if line[4] in self.referenced_chromosomes: + self.lefl(f"ID '{line[4]}' is already registered as a chromosome.", + line) + self.warnskip(line) + return + self.vrids[line[1]].update({line[4] : line}) From ffc7935958f28d79be481d78b327c4bdcc6b37e3 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 20 Jul 2023 11:53:48 -0600 Subject: [PATCH 04/44] Report errors for column additions on non-existent types --- haptools/val_hapfile.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/haptools/val_hapfile.py 
b/haptools/val_hapfile.py index 72b38a89..0968a1f6 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -254,6 +254,15 @@ def find_column_additions(self) -> list[Line]: lambda line : search(r"#[H|R|V]", line[0]) != None, self.meta_lines)) + invalid_lines = [x for x in self.meta_lines if x not in additions and len(x[0]) > 1] + + for ln in invalid_lines: + self.lwexfl("Invalid column addition type.", + "A column addition for 'H', 'R', or 'V'", + f"A column addition for '{ln[0][1]}', whose type doesn't exist", + ln) + + return additions From beabc9825307539e58d84e58cb6be7a6852a9ad9 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 20 Jul 2023 16:49:05 -0600 Subject: [PATCH 05/44] Recognize extra columns and cast validation for extra column types --- haptools/val_hapfile.py | 47 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 0968a1f6..b0794386 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -580,7 +580,49 @@ def validate_variant_ids(self): self.logger.warn(f"{TRAIL} Define haplotype '{haplotype}' or fix the variant haplotype reference") + # + # Extra field validation + # + + + def validate_extra_fields(self): + for tp in range(HapFile.KEY_HAPLOTYPE, HapFile.KEY_VARIANT + 1): + excol_count = len(self.types_ex[tp]) + lines = self.data[tp] + + for line in lines: + rs = (5 if tp != HapFile.KEY_VARIANT else 6) + extras = line.count - rs + if extras != excol_count: + self.lwexfl("Invalid amount of extra columns in line.", + excol_count, + extras, + line) + + if extras < 0: + self.lefl("There aren't even enough mandatory columns", line) + + self.warnskip(line) + continue + + for ptp, col in zip(self.types_ex[tp], line.columns[rs:]): + conv = self.determine_if_is_convertible(col, ptp) + if not conv: + self.leexfl("Value in extra column is not convertible to the associated type", + f"A value that can be converted to a(n) 
{str(ptp)[8:-2]}", + col, + line) + + + def determine_if_is_convertible(self, what : str, tp : type): + if tp == int: + return what.isdigit() + + if tp == float: + return what.isnumeric() + + return tp == str # @@ -625,10 +667,9 @@ def is_hapfile_valid(filename : str, sorted = True) -> bool: hapfile.validate_columns_fulfill_minreqs() hapfile.validate_variant_ids() - hapfile.validate_version_declarations() - + hapfile.validate_extra_fields() - + hapfile.validate_version_declarations() return True From 6b45589cb8eeee0d7789566d46ed18e178965e43 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 20 Jul 2023 17:07:21 -0600 Subject: [PATCH 06/44] Corrected bug where float values were unrecognized --- haptools/val_hapfile.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index b0794386..59a963ab 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -615,15 +615,27 @@ def validate_extra_fields(self): line) - def determine_if_is_convertible(self, what : str, tp : type): + def determine_if_is_convertible(self, what : str, tp : type) -> bool: if tp == int: return what.isdigit() if tp == float: - return what.isnumeric() + return search(r"\d*\.?\d+$", what) != None return tp == str + # + # Extra field reordering + # + + def reorder_extra_fields(self): + reordering_metalns = list(filter( + lambda line : line.count > 1 and search("order[H|R|V]", line[1]) != None, + self.meta_lines)) + + for line in reordering_metalns: + print(line.content) + # # Logging From 2233b095190f0d1948d1240e931f2ae57823d180 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Fri, 21 Jul 2023 12:59:43 -0600 Subject: [PATCH 07/44] Allow parsing & reordering of extra columns --- haptools/val_hapfile.py | 50 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 59a963ab..8f15b353 100644 --- 
a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -200,17 +200,16 @@ def validate_version_declarations(self): versions = self.extract_version_declarations() if len(versions) == 0: self.logger.warn(f"{TRAIL} No version declaration found. Assuming to use the latest version.") + return - for version in versions: - self.validate_version_format(version) + self.validate_version_format(versions[-1]) def extract_version_declarations(self) -> list[Line]: decls = list(filter(lambda x : x.count > 1 and x[1] == "version", self.meta_lines)) if len(decls) > 1: - self.logger.warn(f"{TRAIL} Found more than one " - "version declaration.") + self.logger.warn(f"{TRAIL} Found more than one version declaration. Using the last instance.") for decl in decls: self.lwfl("", decl, sep = "") @@ -633,8 +632,45 @@ def reorder_extra_fields(self): lambda line : line.count > 1 and search("order[H|R|V]", line[1]) != None, self.meta_lines)) - for line in reordering_metalns: - print(line.content) + for i, c in enumerate(['H', 'R', 'V']): + relevant = list(filter(lambda line : line[1][5] == c, reordering_metalns)) + + if len(relevant) == 0: + continue + + if len(relevant) > 1: + self.logger.warn(f"Found multiple order{c} definition lines. 
Using the last available one.") + + ln = relevant[-1] + + self.reorder_field_types(i, ln) + + + def reorder_field_types(self, tp : int, line : Line): + extpc = len(self.vars_ex[tp].keys()) + exclc = line.count - 2 + + if (extpc != exclc): + self.leexfl("Not enough columns in extra column reordering", + extpc, + exclc, + line) + self.warnskip(line) + return + + s = False + for col in line.columns[2:]: + if not col in self.vars_ex[tp]: + self.lefl(f"{col} has not been defined as an extra colunm", line) + s = True + + if s: + self.warnskip(line) + return + + self.types_ex[tp].clear() + for col in line.columns[2:]: + self.types_ex[tp].append(self.vars_ex[tp][col]) # @@ -679,6 +715,8 @@ def is_hapfile_valid(filename : str, sorted = True) -> bool: hapfile.validate_columns_fulfill_minreqs() hapfile.validate_variant_ids() + hapfile.reorder_extra_fields() + hapfile.validate_extra_fields() hapfile.validate_version_declarations() From 0cb658611529d8780add0e001d01a964621bc097 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Sun, 23 Jul 2023 20:11:14 -0600 Subject: [PATCH 08/44] Append feature to cli Missing cleanup + checking against genotype file --- haptools/__main__.py | 43 +++ haptools/val_hapfile.py | 732 +++++++++++++++++++++++----------------- 2 files changed, 474 insertions(+), 301 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 39cf3322..5d683e68 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import annotations +from enum import Flag import sys from pathlib import Path @@ -1025,6 +1026,48 @@ def clump( ) +@main.command(short_help="Validate the structure of a .hap file") +@click.argument("filename", type=click.Path(exists=True, path_type=Path)) +@click.option( + "--sort/--no-sort", + is_flag=True, + default=True, + show_default=True, + help="Sorting of the file will not be performed", +) +@click.option( + "--pvar", + type=click.Path(path_type=Path), + default=None, + 
show_default="input file", + help="A .hap file containing sorted and indexed haplotypes and variants", +) +@click.option( + "-v", + "--verbosity", + type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"]), + default="INFO", + show_default=True, + help="The level of verbosity desired", +) +def validate_hapfile( + filename: Path, + sort: bool, + pvar: Path, + verbosity: str = "DEBUG", +): + + from haptools import val_hapfile + from .logging import getLogger + + log = getLogger(name="validate-hapfile", level = verbosity) + + is_valid = val_hapfile.is_hapfile_valid(filename, sorted=sort, logger=log) + + if not is_valid: + log.warn("Found several warnings and / or errors in the hapfile") + + if __name__ == "__main__": # run the CLI if someone tries 'python -m haptools' on the command line main(prog_name="haptools") diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 8f15b353..c366320e 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -1,278 +1,308 @@ +from __future__ import annotations + import os -from logging import Logger, getLogger +from haptools import logging + from re import search +from pathlib import Path + +from haptools import logging -VALHAP_LOGGER_NAME = "Hapfile Validation" +LOGGER_NAME = "valhap" LTS_SPEC = "0.2.0" TRAIL = "\n>>>" -def tmpex(expectation : object, received : object) -> str: +def tmpex(expectation: object, received: object) -> str: return f"Expected: {expectation}\nReceived: {received}" class Line: + def __init__(self, content: str, number: int): + self.content: str = content + self.number: int = number - - def __init__(self, content : str, number : int): - self.content : str = content - self.number : int = number - - self.columns : list[str] = content.split() - self.count : int = len(self.columns) - + self.columns: list[str] = content.split() + self.count: int = len(self.columns) def is_empty(self) -> bool: return self.count == 0 - - def __getitem__(self, index : int) -> str: + 
def __getitem__(self, index: int) -> str: return self.columns[index] - def __str__(self) -> str: return self.content class HapFileIO: - - - def __init__(self, filename : str, logger : Logger = getLogger(VALHAP_LOGGER_NAME)): + def __init__(self, filename: Path, logger=None): self.filename = filename - self.logger = logger + self.log = logger or logging.getLogger(LOGGER_NAME) + def lines(self, sorted: bool = True) -> list[Line]: + buffer = open(self.filename) - def lines(self, sorted : bool = True) -> list[Line]: - buffer = open(self.filename) - - content = [Line(line.strip(), i + 1) for i, line in enumerate(buffer.readlines())] - content = list(filter(lambda line : not line.is_empty(), content)) + content = [ + Line(line.strip(), i + 1) for i, line in enumerate(buffer.readlines()) + ] + content = list(filter(lambda line: not line.is_empty(), content)) buffer.close() if not sorted: - meta_limit = next(idx for idx, line in enumerate(content) if not line[0].startswith('#')) - content = [line for idx, line in enumerate(content) if (not line[0].startswith('#')) or idx < meta_limit] + meta_limit = next( + idx for idx, line in enumerate(content) if not line[0].startswith("#") + ) + content = [ + line + for idx, line in enumerate(content) + if (not line[0].startswith("#")) or idx < meta_limit + ] # lol - content.sort(key = lambda line : ord(line[0][0])) + content.sort(key=lambda line: ord(line[0][0])) return content - def validate_existence(self) -> bool: if not self.exists(): - self.logger.error(f"The file {self.filename} does not exist.") + self.log.error(f"The file {self.filename} does not exist.") return False is_ok = True if not self.is_regular(): - self.logger.error(f"Cannot read {self.filename}: Is not a regular file.") + self.log.error(f"Cannot read {self.filename}: Is not a regular file.") is_ok = False if not self.is_readable(): - self.logger.error(f"Cannot read {self.filename}: Insufficient permissions.") + self.log.error(f"Cannot read {self.filename}: 
Insufficient permissions.") is_ok = False return is_ok - def exists(self) -> bool: return os.path.exists(self.filename) - def is_regular(self): return os.path.isfile(self.filename) - def is_readable(self) -> bool: return os.access(self.filename, os.R_OK) -class HapFile: - +class HapFileValidator: + errc : int = 0 + warc : int = 0 # H CHROM START END ID - MANDATORY_HAPLOTYPE_COLUMN_COUNT : int = 5 + MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 # R CHROM START END ID LN - MANDATORY_REPEAT_COLUMN_COUNT : int = 5 + MANDATORY_REPEAT_COLUMN_COUNT: int = 5 # V CHROM START END ID CHROM LN - MANDATORY_VARIANT_COLUMN_COUNT : int = 6 + MANDATORY_VARIANT_COLUMN_COUNT: int = 6 # # version - MANDATORY_VERSION_COLUMNS : int = 3 + MANDATORY_VERSION_COLUMNS: int = 3 # #X Name Type [Description] MANDATORY_DEFINITION_COLUMNS = 3 - KEY_HAPLOTYPE : int = 0 - KEY_REPEAT : int = 1 - KEY_VARIANT : int = 2 - KEY_VARIANT_SRC : int = 9 + KEY_HAPLOTYPE: int = 0 + KEY_REPEAT: int = 1 + KEY_VARIANT: int = 2 + KEY_VARIANT_SRC: int = 9 NAME_HAPLOTYPE = "Haplotype" - NAME_REPEAT = "Repeat" - NAME_VARIANT = "Variant" - - KEY_KEY : str = "HT::Key" - KEY_CHROMOSOME : str = "HT::Chromosome" - KEY_START : str = "HT::Start" - KEY_END : str = "HT::End" - KEY_ID : str = "HT::ID" - KEY_ALLELE : str = "HT::Allele" - - DEFAULT_HEADER : dict[int, dict[str, type]] = { - KEY_HAPLOTYPE : {}, - KEY_REPEAT : {}, - KEY_VARIANT : {} - } - - EMPTY_TYPES : dict[int, list[type]] = { - KEY_HAPLOTYPE : [], - KEY_REPEAT : [], - KEY_VARIANT : [] + NAME_REPEAT = "Repeat" + NAME_VARIANT = "Variant" + + KEY_KEY: str = "HT::Key" + KEY_CHROMOSOME: str = "HT::Chromosome" + KEY_START: str = "HT::Start" + KEY_END: str = "HT::End" + KEY_ID: str = "HT::ID" + KEY_ALLELE: str = "HT::Allele" + + #DEFAULT_HEADER: dict[int, dict[str, type]] = { + DEFAULT_HEADER: dict = { + KEY_HAPLOTYPE: {}, + KEY_REPEAT: {}, + KEY_VARIANT: {}, } - EMPTY_DATA : dict[int, list[Line]] = { - KEY_HAPLOTYPE : [], - KEY_REPEAT : [], - KEY_VARIANT : [] + 
#EMPTY_TYPES: dict[int, list[type]] = { + EMPTY_TYPES: dict = { + KEY_HAPLOTYPE: [], + KEY_REPEAT: [], + KEY_VARIANT: [], } - EMPTY_HRIDS : dict[int, dict[str, Line]] = { - KEY_HAPLOTYPE : {}, - KEY_REPEAT : {}, + #EMPTY_DATA: dict[int, list[Line]] = { + EMPTY_DATA: dict = { + KEY_HAPLOTYPE: [], + KEY_REPEAT: [], + KEY_VARIANT: [], } - EMPTY_VRIDS : dict[str, dict[str, Line]] = { + #EMPTY_HRIDS: dict[int, dict[str, Line]] = { + EMPTY_HRIDS: dict = { + KEY_HAPLOTYPE: {}, + KEY_REPEAT: {}, } - EMPTY_META : list[Line] = [] + #EMPTY_VRIDS: dict[str, dict[str, Line]] = {} + EMPTY_VRIDS: dict = {} + #EMPTY_META: list[Line] = [] + EMPTY_META: list = [] - def __init__(self, logger : Logger = getLogger(VALHAP_LOGGER_NAME)): - self.logger : Logger = logger + def __init__(self, logger=None): + self.log = logger or logging.getLogger(LOGGER_NAME) - self.vars_ex : dict[int, dict[str, type]] = HapFile.DEFAULT_HEADER - self.types_ex : dict[int, list[type]] = HapFile.EMPTY_TYPES + self.vars_ex: dict[int, dict[str, type]] = HapFileValidator.DEFAULT_HEADER + self.types_ex: dict[int, list[type]] = HapFileValidator.EMPTY_TYPES - self.meta : list[Line] = HapFile.EMPTY_META - self.data : dict[int, list[Line]] = HapFile.EMPTY_DATA + self.meta: list[Line] = HapFileValidator.EMPTY_META + self.data: dict[int, list[Line]] = HapFileValidator.EMPTY_DATA - self.hrids : dict[int, dict[str, Line]] = HapFile.EMPTY_HRIDS - self.vrids : dict[str, dict[str, Line]] = HapFile.EMPTY_VRIDS + self.hrids: dict[int, dict[str, Line]] = HapFileValidator.EMPTY_HRIDS + self.vrids: dict[str, dict[str, Line]] = HapFileValidator.EMPTY_VRIDS - self.referenced_chromosomes : set[str] = set() + self.referenced_chromosomes: set[str] = set() - - def extract_and_store_content(self, file : HapFileIO, sorted : bool = True): - lines = file.lines(sorted = sorted) + def extract_and_store_content(self, file: HapFileIO, sorted: bool = True): + lines = file.lines(sorted=sorted) self.extract_meta_lines(lines) 
self.extract_data_lines(lines) - - def extract_meta_lines(self, lines : list[Line]): - header_limit = next(i for i, line in enumerate(lines) if not line[0].startswith('#')) + def extract_meta_lines(self, lines: list[Line]): + header_limit = next( + i for i, line in enumerate(lines) if not line[0].startswith("#") + ) self.meta_lines = lines[:header_limit] - - def extract_data_lines(self, lines : list[Line]): + def extract_data_lines(self, lines: list[Line]): limits = [0, 0, 0] - for i, char in enumerate(['H', 'R', 'V']): - limits[i] = next(i for i, line in enumerate(lines) if line[0].startswith(char)) - - ln = [lines[limits[0] : limits[1]], lines[limits[1] : limits[2]], lines[limits[2] : ]] - - for i in range(HapFile.KEY_HAPLOTYPE, HapFile.KEY_VARIANT + 1): + for i, char in enumerate(["H", "R", "V"]): + limits[i] = next( + i for i, line in enumerate(lines) if line[0].startswith(char) + ) + + ln = [ + lines[limits[0] : limits[1]], + lines[limits[1] : limits[2]], + lines[limits[2] :], + ] + + for i in range(HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1): self.data[i] = ln[i] - # # Version Validation # - + def validate_version_declarations(self): versions = self.extract_version_declarations() if len(versions) == 0: - self.logger.warn(f"{TRAIL} No version declaration found. Assuming to use the latest version.") + self.log.warn( + f"{TRAIL} No version declaration found. Assuming to use the latest" + " version." + ) + HapFileValidator.warc += 1 return self.validate_version_format(versions[-1]) - def extract_version_declarations(self) -> list[Line]: - decls = list(filter(lambda x : x.count > 1 and x[1] == "version", self.meta_lines)) + decls = list( + filter(lambda x: x.count > 1 and x[1] == "version", self.meta_lines) + ) if len(decls) > 1: - self.logger.warn(f"{TRAIL} Found more than one version declaration. Using the last instance.") + self.log.warn( + f"{TRAIL} Found more than one version declaration. Using the last" + " instance. 
Each is its own warning." + ) for decl in decls: - self.lwfl("", decl, sep = "") + HapFileValidator.warc += 1 + self.lwfl("", decl, sep="") return decls - - def validate_version_format(self, version : Line): + def validate_version_format(self, version: Line): if version.count < 3: - self.leexfl("Not enough columns in version declaration", - HapFile.MANDATORY_DEFINITION_COLUMNS, - version.count, - version) - self.logger.warning(f"Skipping line #{version.number}") - + self.leexfl( + "Not enough columns in version declaration", + HapFileValidator.MANDATORY_DEFINITION_COLUMNS, + version.count, + version, + ) + self.warnskip(version) + + HapFileValidator.errc += 1 return if search(r"\d+\.\d+\.\d+", version[2]) == None: - self.lwexfl("Version is incorrectly formatted", - "'x.x.x' where 'x' is an integer", - version[2], - version) + self.leexfl( + "Version is incorrectly formatted", + "'x.x.x' where 'x' is an integer", + version[2], + version, + ) + HapFileValidator.errc += 1 # # Column additions # - def validate_column_additions(self): additions = self.find_column_additions() for i, k in enumerate(["#H", "#R", "#V"]): - self.add_column_additions_to_header(i, list(filter( - lambda line: line[0] == k, - additions))) - + self.add_column_additions_to_header( + i, list(filter(lambda line: line[0] == k, additions)) + ) def find_column_additions(self) -> list[Line]: - additions = list(filter( - lambda line : search(r"#[H|R|V]", line[0]) != None, - self.meta_lines)) + additions = list( + filter(lambda line: search(r"#[H|R|V]", line[0]) != None, self.meta_lines) + ) - invalid_lines = [x for x in self.meta_lines if x not in additions and len(x[0]) > 1] + invalid_lines = [ + x for x in self.meta_lines if x not in additions and len(x[0]) > 1 + ] for ln in invalid_lines: - self.lwexfl("Invalid column addition type.", - "A column addition for 'H', 'R', or 'V'", - f"A column addition for '{ln[0][1]}', whose type doesn't exist", - ln) - + self.leexfl( + "Invalid column addition type.", 
+ "A column addition for 'H', 'R', or 'V'", + f"A column addition for '{ln[0][1]}', whose type doesn't exist", + ln, + ) + HapFileValidator.errc += 1 return additions - - def add_column_additions_to_header(self, tp : int, additions : list[Line]): + def add_column_additions_to_header(self, tp: int, additions: list[Line]): for addition in additions: if addition.count < 3: - self.lwexfl("Insufficient columns for extra column definition", - HapFile.MANDATORY_DEFINITION_COLUMNS, - addition.count, - addition) + self.leexfl( + "Insufficient columns for extra column definition", + HapFileValidator.MANDATORY_DEFINITION_COLUMNS, + addition.count, + addition, + ) self.warnskip(addition) + + HapFileValidator.errc += 1 return ptp = self.retrieve_column_addition_data_type(addition) @@ -281,11 +311,10 @@ def add_column_additions_to_header(self, tp : int, additions : list[Line]): self.warnskip(addition) continue - self.vars_ex[tp].update({addition[1] : ptp}) + self.vars_ex[tp].update({addition[1]: ptp}) self.types_ex[tp].append(ptp) - - def retrieve_column_addition_data_type(self, addition : Line) -> type: + def retrieve_column_addition_data_type(self, addition: Line) -> type: tp = addition[2] if tp == "d": @@ -297,125 +326,156 @@ def retrieve_column_addition_data_type(self, addition : Line) -> type: if tp == "s": return str - self.lwexfl("Could not parse type for column addition", - "One of: 'd', 's', '.xf' (where 'x' is an integer)", - f"{addition[2]}", - addition) - + self.leexfl( + "Could not parse type for column addition", + "One of: 'd', 's', '.xf' (where 'x' is an integer)", + f"{addition[2]}", + addition, + ) + + HapFileValidator.errc += 1 return object - # # Minimum Requirements # - def validate_columns_fulfill_minreqs(self): self.validate_haplotypes() self.validate_repeats() self.validate_variants() - def validate_haplotypes(self): - for line in self.data[HapFile.KEY_HAPLOTYPE]: - has_min_cols = self.check_has_min_cols(line, - HapFile.MANDATORY_HAPLOTYPE_COLUMN_COUNT) + 
for line in self.data[HapFileValidator.KEY_HAPLOTYPE]: + has_min_cols = self.check_has_min_cols( + line, HapFileValidator.MANDATORY_HAPLOTYPE_COLUMN_COUNT + ) self.check_start_and_end_positions(line) if not has_min_cols: - self.lwexfl("Cannot check for variant references: Insufficient columns", - "A mandatory 5 columns for haplotyes", - line.count, - line) + self.lwexfl( + "Cannot check for variant references: Insufficient columns", + "A mandatory 5 columns for haplotyes", + line.count, + line, + ) self.warnskip(line) + HapFileValidator.warc += 1 return variant_refs = self.vrids.get(line[4]) if variant_refs == None: - self.lwexfl(f"Haplotype ID '{line[4]}' is not associated to any variants", - f"A variant association for Haplotype ID '{line[4]}'", - "No association", - line) + self.leexfl( + f"Haplotype ID '{line[4]}' is not associated to any variants", + f"A variant association for Haplotype ID '{line[4]}'", + "No association", + line, + ) + + HapFileValidator.errc += 1 return - def validate_repeats(self): - for line in self.data[HapFile.KEY_REPEAT]: - self.check_has_min_cols(line, - HapFile.MANDATORY_REPEAT_COLUMN_COUNT) + for line in self.data[HapFileValidator.KEY_REPEAT]: + self.check_has_min_cols(line, HapFileValidator.MANDATORY_REPEAT_COLUMN_COUNT) self.check_start_and_end_positions(line) - def validate_variants(self): - for line in self.data[HapFile.KEY_VARIANT]: - self.check_has_min_cols(line, - HapFile.MANDATORY_VARIANT_COLUMN_COUNT) + for line in self.data[HapFileValidator.KEY_VARIANT]: + self.check_has_min_cols(line, HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT) self.check_start_and_end_positions(line) self.check_variant_alleles(line) - - def check_has_min_cols(self, line : Line, min : int) -> bool: + def check_has_min_cols(self, line: Line, min: int) -> bool: if line.count < min: - self.lwexfl("Invalid amount of mandatory columns in definition.", - f"At least {min}", - line.count, - line) + self.leexfl( + "Invalid amount of mandatory columns in 
definition.", + f"At least {min}", + line.count, + line, + ) + + HapFileValidator.errc += 1 return False return True - - def check_start_and_end_positions(self, line : Line): + def check_start_and_end_positions(self, line: Line): if line.count < 3: - self.lefl("Cannot validate start and end positions: Insufficient columns", - line) + self.lwfl( + "Cannot validate start and end positions: Insufficient columns", line + ) self.warnskip(line) + + HapFileValidator.warc += 1 return f = False if not line[2].isdigit(): - self.leexfl("Cannot convert start position to integer", - "Integer values for the start position", - line[2], - line) + self.leexfl( + "Cannot convert start position to integer", + "Integer values for the start position", + line[2], + line, + ) + + HapFileValidator.errc += 1 f = True if not line[3].isdigit(): - self.leexfl("Cannot convert end position to integer", - "Integer values for the end position", - line[3], - line) + self.leexfl( + "Cannot convert end position to integer", + "Integer values for the end position", + line[3], + line, + ) + + HapFileValidator.errc += 1 f = True if f: - self.lwfl("Cannot test for correct position order due to previous errors (Inconvertible integers)", - line) + self.lwfl( + "Cannot test for correct position order due to previous errors" + " (Inconvertible integers)", + line, + ) self.warnskip(line) + + HapFileValidator.warc += 1 return start = int(line[2]) - end = int(line[3]) + end = int(line[3]) if start > end: - self.lwexfl("Start position is greater than the end position", - f"Start to be positioned at or before the end", - f"{start} > {end} | Difference of {start - end}", - line) + self.leexfl( + "Start position is greater than the end position", + f"Start to be positioned at or before the end", + f"{start} > {end} | Difference of {start - end}", + line, + ) + + HapFileValidator.errc += 1 if line.count < 5: - self.lwexfl("Cannot perform position validations against variant definitions: Insufficient columns.", + 
self.lwexfl( + "Cannot perform position validations against variant definitions:" + " Insufficient columns.", 5, line.count, - line) + line, + ) self.warnskip(line) + + HapFileValidator.warc += 1 return - + variant_refs = self.vrids.get(line[4]) if variant_refs == None: @@ -423,83 +483,115 @@ def check_start_and_end_positions(self, line : Line): for id, ln in variant_refs.items(): if not ln[2].isdigit(): - self.lwexfl("Variant start position cannot be converted to an integer.", - "An integer", - ln[2], - ln) + self.leexfl( + "Variant start position cannot be converted to an integer.", + "An integer", + ln[2], + ln, + ) self.warnskip(line) + + HapFileValidator.errc += 1 return if not ln[3].isdigit(): - self.lwexfl("Variant end position cannot be converted to an integer.", - "An integer", - ln[3], - ln) + self.leexfl( + "Variant end position cannot be converted to an integer.", + "An integer", + ln[3], + ln, + ) self.warnskip(line) + + HapFileValidator.errc += 1 return vstart = int(ln[2]) vend = int(ln[3]) if vstart < start: - self.lwexfl("Variant start position cannot be prior to the start position of its haplotype.", + self.leexfl( + "Variant start position cannot be prior to the start position of" + " its haplotype.", "The variant to start after or when the haplotype does", - f"[Variant] {vstart} < [Haplotype] {start} | Difference of {start - vstart}", - line) - self.logger.warn(f"At Line #{ln.number}: {ln}") + f"[Variant] {vstart} < [Haplotype] {start} | Difference of" + f" {start - vstart}", + line, + ) + self.log.warn(f"At Line #{ln.number}: {ln}") + + HapFileValidator.errc += 1 if vend > end: - self.lwexfl("Variant end position cannot be after than the end position of its haplotype.", + self.leexfl( + "Variant end position cannot be after than the end position of its" + " haplotype.", "The variant to end before or when the haplotype does", - f"[Variant] {vend} > [Haplotype] {end} | Difference of {vend - end}", - line) - self.logger.warn(f"At Line #{ln.number}: 
{ln}") - - - def check_variant_alleles(self, line : Line): - if line.count < HapFile.MANDATORY_VARIANT_COLUMN_COUNT: - self.lwexfl("Cannot test for variant allele type: Not enough columns.", - HapFile.MANDATORY_VARIANT_COLUMN_COUNT, - line.count, - line) + f"[Variant] {vend} > [Haplotype] {end} | Difference of" + f" {vend - end}", + line, + ) + self.log.warn(f"At Line #{ln.number}: {ln}") + + HapFileValidator.errc += 1 + + def check_variant_alleles(self, line: Line): + if line.count < HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT: + self.lwexfl( + "Cannot test for variant allele type: Not enough columns.", + HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT, + line.count, + line, + ) self.warnskip(line) + + HapFileValidator.warc += 1 return if line[5].upper() not in ["A", "C", "G", "T"]: - self.lwexfl("Invalid allele type in variant.", - "One of 'A', 'C', 'G', 'T'", - f"'{line[5]}'", - line) + self.leexfl( + "Invalid allele type in variant.", + "One of 'A', 'C', 'G', 'T'", + f"'{line[5]}'", + line, + ) + HapFileValidator.errc += 1 # # ID Storage # - def store_ids(self): for tp in range(2): for line in self.data[tp]: self.store_hrid(tp, line) - for line in self.data[HapFile.KEY_VARIANT]: + for line in self.data[HapFileValidator.KEY_VARIANT]: self.store_variant_id(line) - - def store_hrid(self, tp : int, line : Line): + def store_hrid(self, tp: int, line: Line): should_skip = False if line.count < 2: - self.lwexfl("Cannot extract chromosome ID: Insufficient columns.", - "At least 1 column", - line.count, - line) + self.lwexfl( + "Cannot extract chromosome ID: Insufficient columns.", + "At least 1 column", + line.count, + line, + ) + + HapFileValidator.warc += 1 should_skip = True if line.count < 5: - self.lwexfl("Cannot extract ID: Insufficient columns.", + self.lwexfl( + "Cannot extract ID: Insufficient columns.", f"At least 5 for ID extraction", line.count, - line) + line, + ) + + HapFileValidator.warc += 1 should_skip = True if should_skip: @@ -509,112 +601,134 @@ 
def store_hrid(self, tp : int, line : Line): self.referenced_chromosomes.add(line[1]) if line[4] in self.hrids[tp]: - self.lwexfl("Duplicate ID.", - "A unique ID", - f"'{line[4]}'", - line) - self.logger.warn(f"Originally defined at: line #{self.hrids[tp][line[4]].number}") + self.leexfl("Duplicate ID.", "A unique ID", f"'{line[4]}'", line) + self.log.warn( + f"Originally defined at: line #{self.hrids[tp][line[4]].number}" + ) self.warnskip(line) + + HapFileValidator.errc += 1 return if line[4] in self.referenced_chromosomes: - self.lefl(f"ID '{line[4]}' is already registered as a chromosome.", - line) + self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) self.warnskip(line) - return + HapFileValidator.warc += 1 + return - self.hrids[tp].update({line[4] : line}) - + self.hrids[tp].update({line[4]: line}) - def store_variant_id(self, line : Line): + def store_variant_id(self, line: Line): if line.count < 5: - self.lwexfl("Cannot extract ID: Insufficient columns.", + self.lwexfl( + "Cannot extract ID: Insufficient columns.", f"At least 5 for ID extraction", line.count, - line) + line, + ) + HapFileValidator.warc += 1 if not line[1] in self.vrids.keys(): - self.vrids.update({line[1] : {}}) + self.vrids.update({line[1]: {}}) if line[4] in self.vrids[line[1]].keys(): - self.lwexfl("Duplicate variant in for a same haplotype ID.", - "A unique ID per haplotype", - f"'{line[4]}'", - line) - self.logger.warn(f"Originally defined at: line #{self.vrids[line[1]][line[4]].number}") - + self.leexfl( + "Duplicate variant in for a same haplotype ID.", + "A unique ID per haplotype", + f"'{line[4]}'", + line, + ) + self.log.warn( + f"Originally defined at: line #{self.vrids[line[1]][line[4]].number}" + ) self.warnskip(line) + + HapFileValidator.errc += 1 return if line[4] in self.referenced_chromosomes: - self.lefl(f"ID '{line[4]}' is already registered as a chromosome.", - line) + self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) 
self.warnskip(line) - return - self.vrids[line[1]].update({line[4] : line}) + HapFileValidator.warc += 1 + return + self.vrids[line[1]].update({line[4]: line}) # # Variant Validation # - def validate_variants_against_haplotypes(self): self.validate_variant_ids() - def validate_variant_ids(self): for haplotype, ids in self.vrids.items(): no_haplotype = False for id, line in ids.items(): - if haplotype not in self.hrids[HapFile.KEY_HAPLOTYPE].keys(): - self.lefl(f"Cannot link variant '{id}' to non-exisent haplotype '{haplotype}'", line) + if haplotype not in self.hrids[HapFileValidator.KEY_HAPLOTYPE].keys(): + self.lefl( + f"Cannot link variant '{id}' to non-exisent haplotype" + f" '{haplotype}'", + line, + ) no_haplotype = True + + HapFileValidator.errc += 1 continue if no_haplotype: - self.logger.warn(f"{TRAIL} Define haplotype '{haplotype}' or fix the variant haplotype reference") - + self.log.warn( + f"{TRAIL} Define haplotype '{haplotype}' or fix the variant" + " haplotype reference" + ) # # Extra field validation # - def validate_extra_fields(self): - for tp in range(HapFile.KEY_HAPLOTYPE, HapFile.KEY_VARIANT + 1): + for tp in range(HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1): excol_count = len(self.types_ex[tp]) lines = self.data[tp] for line in lines: - rs = (5 if tp != HapFile.KEY_VARIANT else 6) + rs = 5 if tp != HapFileValidator.KEY_VARIANT else 6 extras = line.count - rs if extras != excol_count: - self.lwexfl("Invalid amount of extra columns in line.", + self.lwexfl( + "Invalid amount of extra columns in line.", excol_count, extras, - line) + line, + ) if extras < 0: self.lefl("There aren't even enough mandatory columns", line) + HapFileValidator.warc += 1 self.warnskip(line) + + HapFileValidator.warc += 1 continue for ptp, col in zip(self.types_ex[tp], line.columns[rs:]): conv = self.determine_if_is_convertible(col, ptp) if not conv: - self.leexfl("Value in extra column is not convertible to the associated type", - f"A value that 
can be converted to a(n) {str(ptp)[8:-2]}", - col, - line) + self.leexfl( + "Value in extra column is not convertible to the associated" + " type", + f"A value that can be converted to a(n) {str(ptp)[8:-2]}", + col, + line, + ) + HapFileValidator.errc += 1 - def determine_if_is_convertible(self, what : str, tp : type) -> bool: + def determine_if_is_convertible(self, what: str, tp: type) -> bool: if tp == int: return what.isdigit() @@ -628,40 +742,48 @@ def determine_if_is_convertible(self, what : str, tp : type) -> bool: # def reorder_extra_fields(self): - reordering_metalns = list(filter( - lambda line : line.count > 1 and search("order[H|R|V]", line[1]) != None, - self.meta_lines)) + reordering_metalns = list( + filter( + lambda line: line.count > 1 and search("order[H|R|V]", line[1]) != None, + self.meta_lines, + ) + ) - for i, c in enumerate(['H', 'R', 'V']): - relevant = list(filter(lambda line : line[1][5] == c, reordering_metalns)) + for i, c in enumerate(["H", "R", "V"]): + relevant = list(filter(lambda line: line[1][5] == c, reordering_metalns)) if len(relevant) == 0: continue if len(relevant) > 1: - self.logger.warn(f"Found multiple order{c} definition lines. Using the last available one.") + self.log.warn( + f"Found multiple order{c} definition lines. Using the last" + " available one." 
+ ) + HapFileValidator.warc += 1 ln = relevant[-1] self.reorder_field_types(i, ln) - - def reorder_field_types(self, tp : int, line : Line): + def reorder_field_types(self, tp: int, line: Line): extpc = len(self.vars_ex[tp].keys()) exclc = line.count - 2 - if (extpc != exclc): - self.leexfl("Not enough columns in extra column reordering", - extpc, - exclc, - line) + if extpc != exclc: + self.leexfl( + "Not enough columns in extra column reordering", extpc, exclc, line + ) self.warnskip(line) + + HapFileValidator.errc += 1 return s = False for col in line.columns[2:]: if not col in self.vars_ex[tp]: self.lefl(f"{col} has not been defined as an extra colunm", line) + HapFileValidator.errc += 1 s = True if s: @@ -672,41 +794,47 @@ def reorder_field_types(self, tp : int, line : Line): for col in line.columns[2:]: self.types_ex[tp].append(self.vars_ex[tp][col]) - # # Logging # - def lefl(self, msg : str, line : Line, sep : str = "\n"): - self.logger.error(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") + def lefl(self, msg: str, line: Line, sep: str = "\n"): + self.log.error(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") + def lwfl(self, msg: str, line: Line, sep: str = "\n"): + self.log.warn(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") - def lwfl(self, msg : str, line : Line, sep : str = "\n"): - self.logger.warn(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") + def lwexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): + self.log.warning( + f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + ) + def leexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): + self.log.error( + f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + ) - def lwexfl(self, msg : str, exp : object, rec : object, line : Line, sep : str = "\n"): - self.logger.warning( - f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}") + def warnskip(self, line: Line): + 
self.log.warning(f"Skipping line #{line.number}") - def leexfl(self, msg : str, exp : object, rec : object, line : Line, sep : str = "\n"): - self.logger.error( - f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}") +def is_hapfile_valid(filename: Path, sorted=True, logger=None) -> bool: + log = logger - def warnskip(self, line : Line): - self.logger.warning(f"Skipping line #{line.number}") + if log == None: + log = logging.getLogger(LOGGER_NAME) -def is_hapfile_valid(filename : str, sorted = True) -> bool: - file = HapFileIO(filename) + file = HapFileIO(filename, logger=log) + errc = 0 + is_readable = file.validate_existence() if not is_readable: return False - hapfile = HapFile() - hapfile.extract_and_store_content(file, sorted = sorted) + hapfile = HapFileValidator(logger=log) + hapfile.extract_and_store_content(file, sorted=sorted) hapfile.store_ids() @@ -721,5 +849,7 @@ def is_hapfile_valid(filename : str, sorted = True) -> bool: hapfile.validate_version_declarations() - return True + log.info(f"Completed HapFile validation with {HapFileValidator.errc} errors and {HapFileValidator.warc} warnings.") + + return HapFileValidator.errc == 0 and HapFileValidator.warc == 0 From ba5f9d861755ade1137855705447b0fd012a7e45 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Mon, 24 Jul 2023 01:49:14 -0600 Subject: [PATCH 09/44] Fix bug where the validator would break if no repeats were provided --- haptools/val_hapfile.py | 14 ++++---------- tests/data/empty.hap | 1 + 2 files changed, 5 insertions(+), 10 deletions(-) create mode 100644 tests/data/empty.hap diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index c366320e..e99253e1 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -11,7 +11,7 @@ LOGGER_NAME = "valhap" LTS_SPEC = "0.2.0" -TRAIL = "\n>>>" +TRAIL = ">>>" def tmpex(expectation: object, received: object) -> str: @@ -188,16 +188,10 @@ def extract_meta_lines(self, lines: list[Line]): self.meta_lines = 
lines[:header_limit] def extract_data_lines(self, lines: list[Line]): - limits = [0, 0, 0] - for i, char in enumerate(["H", "R", "V"]): - limits[i] = next( - i for i, line in enumerate(lines) if line[0].startswith(char) - ) - ln = [ - lines[limits[0] : limits[1]], - lines[limits[1] : limits[2]], - lines[limits[2] :], + [ln for ln in lines if ln[0].startswith("H")], + [ln for ln in lines if ln[0].startswith("R")], + [ln for ln in lines if ln[0].startswith("V")] ] for i in range(HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1): diff --git a/tests/data/empty.hap b/tests/data/empty.hap new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/data/empty.hap @@ -0,0 +1 @@ + From af7dbb5dcc4f4c3fd371fff430753935289f98a9 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Mon, 24 Jul 2023 15:52:24 -0600 Subject: [PATCH 10/44] Complete first working instance of the validator. Further Tasks: - Testing - Optimizing - Bug-catching --- haptools/__main__.py | 10 ++++---- haptools/val_hapfile.py | 44 +++++++++++++++++++++++++--------- tests/data/hapfiles/basic.hap | 7 ++++++ tests/data/hapfiles/basic.pvar | 5 ++++ 4 files changed, 50 insertions(+), 16 deletions(-) create mode 100644 tests/data/hapfiles/basic.hap create mode 100644 tests/data/hapfiles/basic.pvar diff --git a/haptools/__main__.py b/haptools/__main__.py index 5d683e68..765e0613 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1036,11 +1036,11 @@ def clump( help="Sorting of the file will not be performed", ) @click.option( - "--pvar", + "--genotypes", type=click.Path(path_type=Path), default=None, - show_default="input file", - help="A .hap file containing sorted and indexed haplotypes and variants", + show_default="optional .pvar file to compare against", + help="A .pvar file containing variant IDs in order to compare them to the .hap file", ) @click.option( "-v", @@ -1053,7 +1053,7 @@ def clump( def validate_hapfile( filename: Path, sort: bool, - pvar: Path, + genotypes: 
Path | None = None, verbosity: str = "DEBUG", ): @@ -1062,7 +1062,7 @@ def validate_hapfile( log = getLogger(name="validate-hapfile", level = verbosity) - is_valid = val_hapfile.is_hapfile_valid(filename, sorted=sort, logger=log) + is_valid = val_hapfile.is_hapfile_valid(filename, sorted=sort, logger=log, pgen=genotypes) if not is_valid: log.warn("Found several warnings and / or errors in the hapfile") diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index e99253e1..c07c260b 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -6,12 +6,14 @@ from re import search from pathlib import Path -from haptools import logging +from pysam import VariantFile + +from .logging import logging +from .data import GenotypesPLINK LOGGER_NAME = "valhap" LTS_SPEC = "0.2.0" -TRAIL = ">>>" def tmpex(expectation: object, received: object) -> str: @@ -205,7 +207,7 @@ def validate_version_declarations(self): versions = self.extract_version_declarations() if len(versions) == 0: self.log.warn( - f"{TRAIL} No version declaration found. Assuming to use the latest" + f"No version declaration found. Assuming to use the latest" " version." ) HapFileValidator.warc += 1 @@ -220,7 +222,7 @@ def extract_version_declarations(self) -> list[Line]: if len(decls) > 1: self.log.warn( - f"{TRAIL} Found more than one version declaration. Using the last" + f"Found more than one version declaration. Using the last" " instance. Each is its own warning." 
) @@ -675,7 +677,7 @@ def validate_variant_ids(self): if no_haplotype: self.log.warn( - f"{TRAIL} Define haplotype '{haplotype}' or fix the variant" + f"Define haplotype '{haplotype}' or fix the variant" " haplotype reference" ) @@ -782,37 +784,50 @@ def reorder_field_types(self, tp: int, line: Line): if s: self.warnskip(line) - return + return self.types_ex[tp].clear() for col in line.columns[2:]: self.types_ex[tp].append(self.vars_ex[tp][col]) + + def compare_haps_to_pvar(self, var_ids : list[str], underscores_to_semicolons : bool = False): + ids : set[tuple[str, Line]] = set() + for chrom, dt in self.vrids.items(): + for k, l in dt.items(): + ids.add((k if not underscores_to_semicolons else k.replace("_", ":"), l)) + + for id, l in ids: + if id not in var_ids: + self.lefl(f"Could not find variant id {id} in the .pvar file!", l) + + HapFileValidator.errc += 1 + # # Logging # def lefl(self, msg: str, line: Line, sep: str = "\n"): - self.log.error(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") + self.log.error(f"{msg}{sep}At line #{line.number}: {line}") def lwfl(self, msg: str, line: Line, sep: str = "\n"): - self.log.warn(f"{TRAIL} {msg}{sep}At line #{line.number}: {line}") + self.log.warn(f"{msg}{sep}At line #{line.number}: {line}") def lwexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): self.log.warning( - f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + f"{msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" ) def leexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): self.log.error( - f"{TRAIL} {msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" + f"{msg}{sep}{tmpex(exp, rec)}{sep}At line #{line.number}: {line}" ) def warnskip(self, line: Line): self.log.warning(f"Skipping line #{line.number}") -def is_hapfile_valid(filename: Path, sorted=True, logger=None) -> bool: +def is_hapfile_valid(filename: Path, sorted : bool = True, pgen : Path | None = None, 
max_variants : int = 10000, logger = None) -> bool: log = logger if log == None: @@ -843,6 +858,13 @@ def is_hapfile_valid(filename: Path, sorted=True, logger=None) -> bool: hapfile.validate_version_declarations() + if pgen != None: + varfile = GenotypesPLINK(pgen) + varfile.read_variants(max_variants = 1000) + + ids = list(map(lambda v : v[0], varfile.variants)) + hapfile.compare_haps_to_pvar(ids) + log.info(f"Completed HapFile validation with {HapFileValidator.errc} errors and {HapFileValidator.warc} warnings.") return HapFileValidator.errc == 0 and HapFileValidator.warc == 0 diff --git a/tests/data/hapfiles/basic.hap b/tests/data/hapfiles/basic.hap new file mode 100644 index 00000000..85519a05 --- /dev/null +++ b/tests/data/hapfiles/basic.hap @@ -0,0 +1,7 @@ + +# version 0.2.0 +H 21 100 110 haplotype_1 +H 21 110 125 haplotype_2 +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A + diff --git a/tests/data/hapfiles/basic.pvar b/tests/data/hapfiles/basic.pvar new file mode 100644 index 00000000..2691d542 --- /dev/null +++ b/tests/data/hapfiles/basic.pvar @@ -0,0 +1,5 @@ +##filedate=20230724 +##contig= +#CHROM POS ID REF ALT +21 100 variant_1 G C +21 110 variant_2 T A From 4c42e690505563028e2d219cf76fc7a256d9fca0 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Mon, 24 Jul 2023 16:02:56 -0600 Subject: [PATCH 11/44] Add a pair of test files to the hapfile directory. Corrected a hapfile. 
--- tests/data/hapfiles/simple.hap | 13 +++++++++++++ tests/data/hapfiles/simple.pvar | 7 +++++++ tests/data/simple.hap | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 tests/data/hapfiles/simple.hap create mode 100644 tests/data/hapfiles/simple.pvar diff --git a/tests/data/hapfiles/simple.hap b/tests/data/hapfiles/simple.hap new file mode 100644 index 00000000..55ac2c74 --- /dev/null +++ b/tests/data/hapfiles/simple.hap @@ -0,0 +1,13 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +H 1 10114 10118 H1 YRI 0.75 +H 1 10114 10119 H2 YRI 0.5 +H 1 10116 10119 H3 ASW 0.25 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/simple.pvar b/tests/data/hapfiles/simple.pvar new file mode 100644 index 00000000..ef40d395 --- /dev/null +++ b/tests/data/hapfiles/simple.pvar @@ -0,0 +1,7 @@ +##filedate=20180225 +##contig= +#CHROM POS ID REF ALT +1 10114 1:10114:T:C T C +1 10116 1:10116:A:G A G +1 10117 1:10117:C:A C A +1 10122 1:10122:A:G A G diff --git a/tests/data/simple.hap b/tests/data/simple.hap index bc327263..55ac2c74 100644 --- a/tests/data/simple.hap +++ b/tests/data/simple.hap @@ -2,7 +2,7 @@ # version 0.2.0 #H ancestry s Local ancestry #H beta .2f Effect size in linear model -H 1 10114 8 H1 YRI 0.75 +H 1 10114 10118 H1 YRI 0.75 H 1 10114 10119 H2 YRI 0.5 H 1 10116 10119 H3 ASW 0.25 V H1 10114 10115 1:10114:T:C T From 269347046705cf2acd2b5b9d691d218c16f33b21 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Mon, 24 Jul 2023 16:08:11 -0600 Subject: [PATCH 12/44] Format files with Black --- haptools/__main__.py | 11 ++++--- haptools/val_hapfile.py | 73 +++++++++++++++++++++++++---------------- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 
765e0613..850df37b 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1040,7 +1040,9 @@ def clump( type=click.Path(path_type=Path), default=None, show_default="optional .pvar file to compare against", - help="A .pvar file containing variant IDs in order to compare them to the .hap file", + help=( + "A .pvar file containing variant IDs in order to compare them to the .hap file" + ), ) @click.option( "-v", @@ -1056,13 +1058,14 @@ def validate_hapfile( genotypes: Path | None = None, verbosity: str = "DEBUG", ): - from haptools import val_hapfile from .logging import getLogger - log = getLogger(name="validate-hapfile", level = verbosity) + log = getLogger(name="validate-hapfile", level=verbosity) - is_valid = val_hapfile.is_hapfile_valid(filename, sorted=sort, logger=log, pgen=genotypes) + is_valid = val_hapfile.is_hapfile_valid( + filename, sorted=sort, logger=log, pgen=genotypes + ) if not is_valid: log.warn("Found several warnings and / or errors in the hapfile") diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index c07c260b..a3336106 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -96,8 +96,8 @@ def is_readable(self) -> bool: class HapFileValidator: - errc : int = 0 - warc : int = 0 + errc: int = 0 + warc: int = 0 # H CHROM START END ID MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 @@ -130,37 +130,37 @@ class HapFileValidator: KEY_ID: str = "HT::ID" KEY_ALLELE: str = "HT::Allele" - #DEFAULT_HEADER: dict[int, dict[str, type]] = { + # DEFAULT_HEADER: dict[int, dict[str, type]] = { DEFAULT_HEADER: dict = { KEY_HAPLOTYPE: {}, KEY_REPEAT: {}, KEY_VARIANT: {}, } - #EMPTY_TYPES: dict[int, list[type]] = { + # EMPTY_TYPES: dict[int, list[type]] = { EMPTY_TYPES: dict = { KEY_HAPLOTYPE: [], KEY_REPEAT: [], KEY_VARIANT: [], } - #EMPTY_DATA: dict[int, list[Line]] = { + # EMPTY_DATA: dict[int, list[Line]] = { EMPTY_DATA: dict = { KEY_HAPLOTYPE: [], KEY_REPEAT: [], KEY_VARIANT: [], } - #EMPTY_HRIDS: dict[int, dict[str, Line]] = { + # 
EMPTY_HRIDS: dict[int, dict[str, Line]] = { EMPTY_HRIDS: dict = { KEY_HAPLOTYPE: {}, KEY_REPEAT: {}, } - #EMPTY_VRIDS: dict[str, dict[str, Line]] = {} + # EMPTY_VRIDS: dict[str, dict[str, Line]] = {} EMPTY_VRIDS: dict = {} - #EMPTY_META: list[Line] = [] + # EMPTY_META: list[Line] = [] EMPTY_META: list = [] def __init__(self, logger=None): @@ -193,10 +193,12 @@ def extract_data_lines(self, lines: list[Line]): ln = [ [ln for ln in lines if ln[0].startswith("H")], [ln for ln in lines if ln[0].startswith("R")], - [ln for ln in lines if ln[0].startswith("V")] + [ln for ln in lines if ln[0].startswith("V")], ] - for i in range(HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1): + for i in range( + HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1 + ): self.data[i] = ln[i] # @@ -207,8 +209,7 @@ def validate_version_declarations(self): versions = self.extract_version_declarations() if len(versions) == 0: self.log.warn( - f"No version declaration found. Assuming to use the latest" - " version." + f"No version declaration found. Assuming to use the latest version." ) HapFileValidator.warc += 1 return @@ -223,7 +224,7 @@ def extract_version_declarations(self) -> list[Line]: if len(decls) > 1: self.log.warn( f"Found more than one version declaration. Using the last" - " instance. Each is its own warning." + f" instance. Each is its own warning." 
) for decl in decls: @@ -328,7 +329,7 @@ def retrieve_column_addition_data_type(self, addition: Line) -> type: f"{addition[2]}", addition, ) - + HapFileValidator.errc += 1 return object @@ -376,13 +377,17 @@ def validate_haplotypes(self): def validate_repeats(self): for line in self.data[HapFileValidator.KEY_REPEAT]: - self.check_has_min_cols(line, HapFileValidator.MANDATORY_REPEAT_COLUMN_COUNT) + self.check_has_min_cols( + line, HapFileValidator.MANDATORY_REPEAT_COLUMN_COUNT + ) self.check_start_and_end_positions(line) def validate_variants(self): for line in self.data[HapFileValidator.KEY_VARIANT]: - self.check_has_min_cols(line, HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT) + self.check_has_min_cols( + line, HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT + ) self.check_start_and_end_positions(line) self.check_variant_alleles(line) @@ -686,7 +691,9 @@ def validate_variant_ids(self): # def validate_extra_fields(self): - for tp in range(HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1): + for tp in range( + HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1 + ): excol_count = len(self.types_ex[tp]) lines = self.data[tp] @@ -784,18 +791,21 @@ def reorder_field_types(self, tp: int, line: Line): if s: self.warnskip(line) - return + return self.types_ex[tp].clear() for col in line.columns[2:]: self.types_ex[tp].append(self.vars_ex[tp][col]) - - def compare_haps_to_pvar(self, var_ids : list[str], underscores_to_semicolons : bool = False): - ids : set[tuple[str, Line]] = set() + def compare_haps_to_pvar( + self, var_ids: list[str], underscores_to_semicolons: bool = False + ): + ids: set[tuple[str, Line]] = set() for chrom, dt in self.vrids.items(): for k, l in dt.items(): - ids.add((k if not underscores_to_semicolons else k.replace("_", ":"), l)) + ids.add( + (k if not underscores_to_semicolons else k.replace("_", ":"), l) + ) for id, l in ids: if id not in var_ids: @@ -827,13 +837,18 @@ def warnskip(self, line: Line): 
self.log.warning(f"Skipping line #{line.number}") -def is_hapfile_valid(filename: Path, sorted : bool = True, pgen : Path | None = None, max_variants : int = 10000, logger = None) -> bool: +def is_hapfile_valid( + filename: Path, + sorted: bool = True, + pgen: Path | None = None, + max_variants: int = 10000, + logger=None, +) -> bool: log = logger if log == None: log = logging.getLogger(LOGGER_NAME) - file = HapFileIO(filename, logger=log) errc = 0 @@ -860,12 +875,14 @@ def is_hapfile_valid(filename: Path, sorted : bool = True, pgen : Path | None = if pgen != None: varfile = GenotypesPLINK(pgen) - varfile.read_variants(max_variants = 1000) + varfile.read_variants(max_variants=1000) - ids = list(map(lambda v : v[0], varfile.variants)) + ids = list(map(lambda v: v[0], varfile.variants)) hapfile.compare_haps_to_pvar(ids) - log.info(f"Completed HapFile validation with {HapFileValidator.errc} errors and {HapFileValidator.warc} warnings.") + log.info( + f"Completed HapFile validation with {HapFileValidator.errc} errors and" + f" {HapFileValidator.warc} warnings." 
+ ) return HapFileValidator.errc == 0 and HapFileValidator.warc == 0 - From 79a845b366107fb8959cbf63312cd5df08e0e861 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Wed, 26 Jul 2023 14:28:11 -0600 Subject: [PATCH 13/44] Create test for the validate command --- haptools/val_hapfile.py | 189 +++++++++--------- tests/data/hapfiles/valhap_correct.hap | 19 ++ tests/data/hapfiles/valhap_test_data.hap | 10 + tests/data/hapfiles/valhap_test_data.pvar | 8 + .../valhap_with_10_extras_reordered.hap | 21 ++ .../data/hapfiles/valhap_with_empty_lines.hap | 26 +++ .../valhap_with_inadequate_version.hap | 10 + .../data/hapfiles/valhap_with_no_version.hap | 9 + .../valhap_with_out_of_header_metas.hap | 20 ++ .../valhap_with_unexistent_fields.hap | 11 + .../valhap_with_unexistent_reorders.hap | 12 ++ tests/test.hap | 10 + tests/test.pvar | 8 + tests/test_data.py | 2 +- tests/test_val_hapfile.py | 112 +++++++++++ 15 files changed, 371 insertions(+), 96 deletions(-) create mode 100644 tests/data/hapfiles/valhap_correct.hap create mode 100644 tests/data/hapfiles/valhap_test_data.hap create mode 100644 tests/data/hapfiles/valhap_test_data.pvar create mode 100644 tests/data/hapfiles/valhap_with_10_extras_reordered.hap create mode 100644 tests/data/hapfiles/valhap_with_empty_lines.hap create mode 100644 tests/data/hapfiles/valhap_with_inadequate_version.hap create mode 100644 tests/data/hapfiles/valhap_with_no_version.hap create mode 100644 tests/data/hapfiles/valhap_with_out_of_header_metas.hap create mode 100644 tests/data/hapfiles/valhap_with_unexistent_fields.hap create mode 100644 tests/data/hapfiles/valhap_with_unexistent_reorders.hap create mode 100644 tests/test.hap create mode 100644 tests/test.pvar create mode 100644 tests/test_val_hapfile.py diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index a3336106..340274bd 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -47,9 +47,10 @@ def lines(self, sorted: bool = True) -> list[Line]: buffer = 
open(self.filename) content = [ - Line(line.strip(), i + 1) for i, line in enumerate(buffer.readlines()) + Line(line.strip(), i + 1) + for i, line in enumerate(buffer.readlines()) + if line and (not line.isspace()) ] - content = list(filter(lambda line: not line.is_empty(), content)) buffer.close() @@ -96,9 +97,6 @@ def is_readable(self) -> bool: class HapFileValidator: - errc: int = 0 - warc: int = 0 - # H CHROM START END ID MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 @@ -130,56 +128,46 @@ class HapFileValidator: KEY_ID: str = "HT::ID" KEY_ALLELE: str = "HT::Allele" - # DEFAULT_HEADER: dict[int, dict[str, type]] = { - DEFAULT_HEADER: dict = { - KEY_HAPLOTYPE: {}, - KEY_REPEAT: {}, - KEY_VARIANT: {}, - } - - # EMPTY_TYPES: dict[int, list[type]] = { - EMPTY_TYPES: dict = { - KEY_HAPLOTYPE: [], - KEY_REPEAT: [], - KEY_VARIANT: [], - } - - # EMPTY_DATA: dict[int, list[Line]] = { - EMPTY_DATA: dict = { - KEY_HAPLOTYPE: [], - KEY_REPEAT: [], - KEY_VARIANT: [], - } - - # EMPTY_HRIDS: dict[int, dict[str, Line]] = { - EMPTY_HRIDS: dict = { - KEY_HAPLOTYPE: {}, - KEY_REPEAT: {}, - } - - # EMPTY_VRIDS: dict[str, dict[str, Line]] = {} - EMPTY_VRIDS: dict = {} - - # EMPTY_META: list[Line] = [] - EMPTY_META: list = [] - def __init__(self, logger=None): self.log = logger or logging.getLogger(LOGGER_NAME) - self.vars_ex: dict[int, dict[str, type]] = HapFileValidator.DEFAULT_HEADER - self.types_ex: dict[int, list[type]] = HapFileValidator.EMPTY_TYPES + self.vars_ex: dict[int, dict[str, type]] = { + HapFileValidator.KEY_HAPLOTYPE: {}, + HapFileValidator.KEY_REPEAT: {}, + HapFileValidator.KEY_VARIANT: {}, + } + + self.types_ex: dict[int, list[type]] = { + HapFileValidator.KEY_HAPLOTYPE: [], + HapFileValidator.KEY_REPEAT: [], + HapFileValidator.KEY_VARIANT: [], + } - self.meta: list[Line] = HapFileValidator.EMPTY_META - self.data: dict[int, list[Line]] = HapFileValidator.EMPTY_DATA + self.meta: list[Line] = [] + self.data: dict[int, list[Line]] = { + HapFileValidator.KEY_HAPLOTYPE: [], 
+ HapFileValidator.KEY_REPEAT: [], + HapFileValidator.KEY_VARIANT: [], + } - self.hrids: dict[int, dict[str, Line]] = HapFileValidator.EMPTY_HRIDS - self.vrids: dict[str, dict[str, Line]] = HapFileValidator.EMPTY_VRIDS + self.hrids: dict[int, dict[str, Line]] = { + HapFileValidator.KEY_HAPLOTYPE: {}, + HapFileValidator.KEY_REPEAT: {}, + } + + self.vrids: dict[str, dict[str, Line]] = {} self.referenced_chromosomes: set[str] = set() + self.errc: int = 0 + self.warc: int = 0 + def extract_and_store_content(self, file: HapFileIO, sorted: bool = True): lines = file.lines(sorted=sorted) + for line in lines: + print(line.content) + self.extract_meta_lines(lines) self.extract_data_lines(lines) @@ -194,8 +182,13 @@ def extract_data_lines(self, lines: list[Line]): [ln for ln in lines if ln[0].startswith("H")], [ln for ln in lines if ln[0].startswith("R")], [ln for ln in lines if ln[0].startswith("V")], + [ln for ln in lines if ln[0][0] not in ['H', 'R', 'V', '#']], ] + for l in ln[3]: + self.lefl("Unrecognized field type. Must be one of 'H', 'R' or 'V'.", l) + self.errc += 1 + for i in range( HapFileValidator.KEY_HAPLOTYPE, HapFileValidator.KEY_VARIANT + 1 ): @@ -208,10 +201,10 @@ def extract_data_lines(self, lines: list[Line]): def validate_version_declarations(self): versions = self.extract_version_declarations() if len(versions) == 0: - self.log.warn( + self.log.warning( f"No version declaration found. Assuming to use the latest version." ) - HapFileValidator.warc += 1 + self.warc += 1 return self.validate_version_format(versions[-1]) @@ -222,13 +215,13 @@ def extract_version_declarations(self) -> list[Line]: ) if len(decls) > 1: - self.log.warn( + self.log.warning( f"Found more than one version declaration. Using the last" f" instance. Each is its own warning." 
) for decl in decls: - HapFileValidator.warc += 1 + self.warc += 1 self.lwfl("", decl, sep="") return decls @@ -243,7 +236,7 @@ def validate_version_format(self, version: Line): ) self.warnskip(version) - HapFileValidator.errc += 1 + self.errc += 1 return if search(r"\d+\.\d+\.\d+", version[2]) == None: @@ -254,7 +247,7 @@ def validate_version_format(self, version: Line): version, ) - HapFileValidator.errc += 1 + self.errc += 1 # # Column additions @@ -284,7 +277,7 @@ def find_column_additions(self) -> list[Line]: f"A column addition for '{ln[0][1]}', whose type doesn't exist", ln, ) - HapFileValidator.errc += 1 + self.errc += 1 return additions @@ -299,7 +292,7 @@ def add_column_additions_to_header(self, tp: int, additions: list[Line]): ) self.warnskip(addition) - HapFileValidator.errc += 1 + self.errc += 1 return ptp = self.retrieve_column_addition_data_type(addition) @@ -330,7 +323,7 @@ def retrieve_column_addition_data_type(self, addition: Line) -> type: addition, ) - HapFileValidator.errc += 1 + self.errc += 1 return object # @@ -359,7 +352,7 @@ def validate_haplotypes(self): ) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return variant_refs = self.vrids.get(line[4]) @@ -372,7 +365,7 @@ def validate_haplotypes(self): line, ) - HapFileValidator.errc += 1 + self.errc += 1 return def validate_repeats(self): @@ -401,7 +394,7 @@ def check_has_min_cols(self, line: Line, min: int) -> bool: line, ) - HapFileValidator.errc += 1 + self.errc += 1 return False return True @@ -413,7 +406,7 @@ def check_start_and_end_positions(self, line: Line): ) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return f = False @@ -426,7 +419,7 @@ def check_start_and_end_positions(self, line: Line): line, ) - HapFileValidator.errc += 1 + self.errc += 1 f = True if not line[3].isdigit(): @@ -437,7 +430,7 @@ def check_start_and_end_positions(self, line: Line): line, ) - HapFileValidator.errc += 1 + self.errc += 1 f = True if f: @@ -448,7 +441,7 @@ def 
check_start_and_end_positions(self, line: Line): ) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return start = int(line[2]) @@ -462,7 +455,7 @@ def check_start_and_end_positions(self, line: Line): line, ) - HapFileValidator.errc += 1 + self.errc += 1 if line.count < 5: self.lwexfl( @@ -474,7 +467,7 @@ def check_start_and_end_positions(self, line: Line): ) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return variant_refs = self.vrids.get(line[4]) @@ -492,7 +485,7 @@ def check_start_and_end_positions(self, line: Line): ) self.warnskip(line) - HapFileValidator.errc += 1 + self.errc += 1 return if not ln[3].isdigit(): @@ -504,7 +497,7 @@ def check_start_and_end_positions(self, line: Line): ) self.warnskip(line) - HapFileValidator.errc += 1 + self.errc += 1 return vstart = int(ln[2]) @@ -519,9 +512,9 @@ def check_start_and_end_positions(self, line: Line): f" {start - vstart}", line, ) - self.log.warn(f"At Line #{ln.number}: {ln}") + self.log.warning(f"At Line #{ln.number}: {ln}") - HapFileValidator.errc += 1 + self.errc += 1 if vend > end: self.leexfl( @@ -532,9 +525,9 @@ def check_start_and_end_positions(self, line: Line): f" {vend - end}", line, ) - self.log.warn(f"At Line #{ln.number}: {ln}") + self.log.warning(f"At Line #{ln.number}: {ln}") - HapFileValidator.errc += 1 + self.errc += 1 def check_variant_alleles(self, line: Line): if line.count < HapFileValidator.MANDATORY_VARIANT_COLUMN_COUNT: @@ -546,7 +539,7 @@ def check_variant_alleles(self, line: Line): ) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return if line[5].upper() not in ["A", "C", "G", "T"]: @@ -557,7 +550,7 @@ def check_variant_alleles(self, line: Line): line, ) - HapFileValidator.errc += 1 + self.errc += 1 # # ID Storage @@ -581,7 +574,7 @@ def store_hrid(self, tp: int, line: Line): line, ) - HapFileValidator.warc += 1 + self.warc += 1 should_skip = True if line.count < 5: @@ -592,7 +585,7 @@ def store_hrid(self, tp: int, line: Line): 
line, ) - HapFileValidator.warc += 1 + self.warc += 1 should_skip = True if should_skip: @@ -603,20 +596,25 @@ def store_hrid(self, tp: int, line: Line): if line[4] in self.hrids[tp]: self.leexfl("Duplicate ID.", "A unique ID", f"'{line[4]}'", line) - self.log.warn( + self.log.warning( f"Originally defined at: line #{self.hrids[tp][line[4]].number}" + f"\n:: {self.hrids[tp][line[4]].content}" ) + for k1, v1 in self.hrids.items(): + for k2, v2 in v1.items(): + print(k2, ":", v2.content) + self.warnskip(line) - HapFileValidator.errc += 1 + self.errc += 1 return if line[4] in self.referenced_chromosomes: self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return self.hrids[tp].update({line[4]: line}) @@ -629,7 +627,7 @@ def store_variant_id(self, line: Line): line.count, line, ) - HapFileValidator.warc += 1 + self.warc += 1 if not line[1] in self.vrids.keys(): self.vrids.update({line[1]: {}}) @@ -641,19 +639,21 @@ def store_variant_id(self, line: Line): f"'{line[4]}'", line, ) - self.log.warn( + self.log.warning( f"Originally defined at: line #{self.vrids[line[1]][line[4]].number}" + f"\n{self.vrids[line[1]][line[4]].content}" ) + self.warnskip(line) - HapFileValidator.errc += 1 + self.errc += 1 return if line[4] in self.referenced_chromosomes: self.lwfl(f"ID '{line[4]}' is already registered as a chromosome.", line) self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 return self.vrids[line[1]].update({line[4]: line}) @@ -677,11 +677,11 @@ def validate_variant_ids(self): ) no_haplotype = True - HapFileValidator.errc += 1 + self.errc += 1 continue if no_haplotype: - self.log.warn( + self.log.warning( f"Define haplotype '{haplotype}' or fix the variant" " haplotype reference" ) @@ -710,11 +710,11 @@ def validate_extra_fields(self): if extras < 0: self.lefl("There aren't even enough mandatory columns", line) - HapFileValidator.warc += 1 + self.warc += 1 
self.warnskip(line) - HapFileValidator.warc += 1 + self.warc += 1 continue for ptp, col in zip(self.types_ex[tp], line.columns[rs:]): @@ -729,7 +729,7 @@ def validate_extra_fields(self): line, ) - HapFileValidator.errc += 1 + self.errc += 1 def determine_if_is_convertible(self, what: str, tp: type) -> bool: if tp == int: @@ -759,11 +759,11 @@ def reorder_extra_fields(self): continue if len(relevant) > 1: - self.log.warn( + self.log.warning( f"Found multiple order{c} definition lines. Using the last" " available one." ) - HapFileValidator.warc += 1 + self.warc += 1 ln = relevant[-1] @@ -779,14 +779,14 @@ def reorder_field_types(self, tp: int, line: Line): ) self.warnskip(line) - HapFileValidator.errc += 1 + self.errc += 1 return s = False for col in line.columns[2:]: if not col in self.vars_ex[tp]: self.lefl(f"{col} has not been defined as an extra colunm", line) - HapFileValidator.errc += 1 + self.errc += 1 s = True if s: @@ -811,7 +811,7 @@ def compare_haps_to_pvar( if id not in var_ids: self.lefl(f"Could not find variant id {id} in the .pvar file!", l) - HapFileValidator.errc += 1 + self.errc += 1 # # Logging @@ -821,7 +821,7 @@ def lefl(self, msg: str, line: Line, sep: str = "\n"): self.log.error(f"{msg}{sep}At line #{line.number}: {line}") def lwfl(self, msg: str, line: Line, sep: str = "\n"): - self.log.warn(f"{msg}{sep}At line #{line.number}: {line}") + self.log.warning(f"{msg}{sep}At line #{line.number}: {line}") def lwexfl(self, msg: str, exp: object, rec: object, line: Line, sep: str = "\n"): self.log.warning( @@ -850,7 +850,6 @@ def is_hapfile_valid( log = logging.getLogger(LOGGER_NAME) file = HapFileIO(filename, logger=log) - errc = 0 is_readable = file.validate_existence() @@ -875,14 +874,14 @@ def is_hapfile_valid( if pgen != None: varfile = GenotypesPLINK(pgen) - varfile.read_variants(max_variants=1000) + varfile.read_variants(max_variants=max_variants) ids = list(map(lambda v: v[0], varfile.variants)) hapfile.compare_haps_to_pvar(ids) log.info( - 
f"Completed HapFile validation with {HapFileValidator.errc} errors and" - f" {HapFileValidator.warc} warnings." + f"Completed HapFile validation with {hapfile.errc} errors and" + f" {hapfile.warc} warnings." ) - return HapFileValidator.errc == 0 and HapFileValidator.warc == 0 + return hapfile.errc == 0 and hapfile.warc == 0 diff --git a/tests/data/hapfiles/valhap_correct.hap b/tests/data/hapfiles/valhap_correct.hap new file mode 100644 index 00000000..717dce16 --- /dev/null +++ b/tests/data/hapfiles/valhap_correct.hap @@ -0,0 +1,19 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + diff --git a/tests/data/hapfiles/valhap_test_data.hap b/tests/data/hapfiles/valhap_test_data.hap new file mode 100644 index 00000000..2ecefe21 --- /dev/null +++ b/tests/data/hapfiles/valhap_test_data.hap @@ -0,0 +1,10 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_test_data.pvar b/tests/data/hapfiles/valhap_test_data.pvar new file mode 100644 index 
00000000..ca962b0b --- /dev/null +++ b/tests/data/hapfiles/valhap_test_data.pvar @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10114 1:10114:T:C T C . . . +1 10116 1:10116:A:G A G . . . +1 10117 1:10117:C:A C A . . . +1 10122 1:10122:A:G A G . . . diff --git a/tests/data/hapfiles/valhap_with_10_extras_reordered.hap b/tests/data/hapfiles/valhap_with_10_extras_reordered.hap new file mode 100644 index 00000000..f22c29da --- /dev/null +++ b/tests/data/hapfiles/valhap_with_10_extras_reordered.hap @@ -0,0 +1,21 @@ +# version 0.2.0 +#H extra4 s +#H extra1 s +#H extra2 s +#H extra3 s +#H extra0 d +#H extra5 s +#H extra6 s +#H extra9 d +#H extra8 s +#H extra7 s +# orderH extra0 extra1 extra2 extra3 extra4 extra5 extra6 extra7 extra8 extra9 +H 1 10114 10118 H1 0 b c d e f g h i 9 +H 1 10114 10119 H2 0 b c d e f g h i 9 +H 1 10116 10119 H3 0 b c d e f g h i 9 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_empty_lines.hap b/tests/data/hapfiles/valhap_with_empty_lines.hap new file mode 100644 index 00000000..32e2914b --- /dev/null +++ b/tests/data/hapfiles/valhap_with_empty_lines.hap @@ -0,0 +1,26 @@ +# orderH ancestry beta +# version 0.2.0 + +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model + +#R beta .2f Effect size in linear model + +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 + +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 + +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G + +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 
26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A + +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + diff --git a/tests/data/hapfiles/valhap_with_inadequate_version.hap b/tests/data/hapfiles/valhap_with_inadequate_version.hap new file mode 100644 index 00000000..010eadd1 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inadequate_version.hap @@ -0,0 +1,10 @@ +# version 0.n.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_no_version.hap b/tests/data/hapfiles/valhap_with_no_version.hap new file mode 100644 index 00000000..32475610 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_no_version.hap @@ -0,0 +1,9 @@ +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_out_of_header_metas.hap b/tests/data/hapfiles/valhap_with_out_of_header_metas.hap new file mode 100644 index 00000000..38676656 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_out_of_header_metas.hap @@ -0,0 +1,20 @@ +# orderH ancestry beta +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +# This should cause an error if the file is sorted +#V test_field s A field to test with +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 
26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A diff --git a/tests/data/hapfiles/valhap_with_unexistent_fields.hap b/tests/data/hapfiles/valhap_with_unexistent_fields.hap new file mode 100644 index 00000000..1b7c97f8 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_unexistent_fields.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A +X 10 15 144 XID1 diff --git a/tests/data/hapfiles/valhap_with_unexistent_reorders.hap b/tests/data/hapfiles/valhap_with_unexistent_reorders.hap new file mode 100644 index 00000000..5fba544a --- /dev/null +++ b/tests/data/hapfiles/valhap_with_unexistent_reorders.hap @@ -0,0 +1,12 @@ +# version 0.2.0 +# Should error out! 
+# orderR x y z +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/test.hap b/tests/test.hap new file mode 100644 index 00000000..1b06286a --- /dev/null +++ b/tests/test.hap @@ -0,0 +1,10 @@ +# version 0.2.0 +H 1 10114 8 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/test.pvar b/tests/test.pvar new file mode 100644 index 00000000..ca962b0b --- /dev/null +++ b/tests/test.pvar @@ -0,0 +1,8 @@ +##fileformat=VCFv4.2 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10114 1:10114:T:C T C . . . +1 10116 1:10116:A:G A G . . . +1 10117 1:10117:C:A C A . . . +1 10122 1:10122:A:G A G . . . diff --git a/tests/test_data.py b/tests/test_data.py index 2932d146..4af4f2c0 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -920,7 +920,7 @@ def _basic_unordered_first_field_haps(self): def _get_dummy_haps(self): # create three haplotypes haplotypes = { - "H1": Haplotype(chrom="1", start=10114, end=8, id="H1"), + "H1": Haplotype(chrom="1", start=10114, end=10118, id="H1"), "H2": Haplotype(chrom="1", start=10114, end=10119, id="H2"), "H3": Haplotype(chrom="1", start=10116, end=10119, id="H3"), } diff --git a/tests/test_val_hapfile.py b/tests/test_val_hapfile.py new file mode 100644 index 00000000..6098cefc --- /dev/null +++ b/tests/test_val_hapfile.py @@ -0,0 +1,112 @@ +import os +from pathlib import Path + +import pytest + +from . 
import test_data +from haptools import val_hapfile +from haptools import data + +DATADIR = Path(__file__).parent.joinpath("data").joinpath("hapfiles") + + +def _generate_fake_haps(): + haps_ = test_data.TestHaplotypes() + haps = haps_._get_dummy_haps() + haps.fname = Path(DATADIR / "valhap_test_data.hap") + haps.write() + + +def _generate_fake_vars(): + vars_ = test_data.TestGenotypesPLINK() + vars = vars_._get_fake_genotypes_plink() + vars.fname = Path(DATADIR / "valhap_test_data.plink") + vars.write_variants() + + +def test_generated_haplotypes(): + _generate_fake_haps() + _generate_fake_vars() + + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_test_data.hap", pgen=DATADIR / "valhap_test_data.pvar" + ) + == True + ) + + +def test_with_empty_lines(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_empty_lines.hap", + ) + == True + ) + + +def test_with_out_of_header_metas_sorted(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_out_of_header_metas.hap", sorted=True + ) + == False + ) + + +def test_with_out_of_header_metas_unsorted(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_out_of_header_metas.hap", sorted=False + ) + == True + ) + + +def test_with_10_extras_reordered(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_10_extras_reordered.hap" + ) + == True + ) + + +def test_with_unexistent_reorders(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_reorders.hap" + ) + == False + ) + + +def test_with_unexistent_fields(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_fields.hap" + ) + == False + ) + + +def test_with_inadequate_version(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inadequate_version.hap" + ) + == False + ) + + +def test_with_no_version(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_no_version.hap" + ) + == False + ) + + +def 
test_unreadable_hapfile(): + assert val_hapfile.is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) == False From 76176c58e45a8558b3657b8a3bb295dfb904bd4b Mon Sep 17 00:00:00 2001 From: Ayimany Date: Wed, 26 Jul 2023 16:58:46 -0600 Subject: [PATCH 14/44] Create tests for validation command --- haptools/val_hapfile.py | 30 +-- tests/data/hapfiles/basic.hap | 1 - tests/data/hapfiles/basic_missing_ids.pvar | 4 + .../hapfiles/valhap_with_duplicate_ids.hap | 5 + ...lhap_with_duplicate_vids_per_haplotype.hap | 4 + .../valhap_with_excol_of_wrong_type.hap | 4 + .../valhap_with_hrid_of_chromosome.hap | 8 + ...valhap_with_inadequate_version_columns.hap | 10 + .../valhap_with_inconvertible_ends.hap | 4 + .../valhap_with_inconvertible_ends_var.hap | 4 + .../valhap_with_inconvertible_starts.hap | 4 + .../valhap_with_inconvertible_starts_var.hap | 4 + .../valhap_with_insufficient_columns.hap | 16 ++ ...ap_with_insufficient_excols_in_reorder.hap | 6 + ...h_invalid_column_addition_column_count.hap | 11 + ...ith_invalid_column_addition_data_types.hap | 11 + ...hap_with_invalid_column_addition_types.hap | 11 + .../valhap_with_multiple_order_defs.hap | 11 + .../valhap_with_multiple_versions.hap | 12 ++ .../hapfiles/valhap_with_start_after_end.hap | 10 + .../valhap_with_unassociated_haplotype.hap | 11 + .../valhap_with_unexistent_col_in_order.hap | 19 ++ .../valhap_with_unexistent_reorders.hap | 12 +- .../valhap_with_unrecognizable_allele.hap | 3 + .../valhap_with_variant_id_of_chromosome.hap | 11 + ...p_with_variant_inexistent_haplotype_id.hap | 12 ++ tests/test_val_hapfile.py | 193 +++++++++++++++++- 27 files changed, 400 insertions(+), 31 deletions(-) create mode 100644 tests/data/hapfiles/basic_missing_ids.pvar create mode 100644 tests/data/hapfiles/valhap_with_duplicate_ids.hap create mode 100644 tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap create mode 100644 tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap create mode 100644 
tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap create mode 100644 tests/data/hapfiles/valhap_with_inadequate_version_columns.hap create mode 100644 tests/data/hapfiles/valhap_with_inconvertible_ends.hap create mode 100644 tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap create mode 100644 tests/data/hapfiles/valhap_with_inconvertible_starts.hap create mode 100644 tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap create mode 100644 tests/data/hapfiles/valhap_with_insufficient_columns.hap create mode 100644 tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap create mode 100644 tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap create mode 100644 tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap create mode 100644 tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap create mode 100644 tests/data/hapfiles/valhap_with_multiple_order_defs.hap create mode 100644 tests/data/hapfiles/valhap_with_multiple_versions.hap create mode 100644 tests/data/hapfiles/valhap_with_start_after_end.hap create mode 100644 tests/data/hapfiles/valhap_with_unassociated_haplotype.hap create mode 100644 tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap create mode 100644 tests/data/hapfiles/valhap_with_unrecognizable_allele.hap create mode 100644 tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap create mode 100644 tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 340274bd..35962edd 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -28,9 +28,6 @@ def __init__(self, content: str, number: int): self.columns: list[str] = content.split() self.count: int = len(self.columns) - def is_empty(self) -> bool: - return self.count == 0 - def __getitem__(self, index: int) -> str: return self.columns[index] @@ -182,7 +179,7 @@ def extract_data_lines(self, lines: list[Line]): [ln for ln 
in lines if ln[0].startswith("H")], [ln for ln in lines if ln[0].startswith("R")], [ln for ln in lines if ln[0].startswith("V")], - [ln for ln in lines if ln[0][0] not in ['H', 'R', 'V', '#']], + [ln for ln in lines if ln[0][0] not in ["H", "R", "V", "#"]], ] for l in ln[3]: @@ -400,7 +397,7 @@ def check_has_min_cols(self, line: Line, min: int) -> bool: return True def check_start_and_end_positions(self, line: Line): - if line.count < 3: + if line.count < 4: self.lwfl( "Cannot validate start and end positions: Insufficient columns", line ) @@ -575,7 +572,8 @@ def store_hrid(self, tp: int, line: Line): ) self.warc += 1 - should_skip = True + self.warnskip(line) + return if line.count < 5: self.lwexfl( @@ -585,11 +583,9 @@ def store_hrid(self, tp: int, line: Line): line, ) - self.warc += 1 - should_skip = True - - if should_skip: self.warnskip(line) + + self.warc += 1 return self.referenced_chromosomes.add(line[1]) @@ -627,7 +623,11 @@ def store_variant_id(self, line: Line): line.count, line, ) + + self.warnskip(line) + self.warc += 1 + return if not line[1] in self.vrids.keys(): self.vrids.update({line[1]: {}}) @@ -662,9 +662,6 @@ def store_variant_id(self, line: Line): # Variant Validation # - def validate_variants_against_haplotypes(self): - self.validate_variant_ids() - def validate_variant_ids(self): for haplotype, ids in self.vrids.items(): no_haplotype = False @@ -773,9 +770,12 @@ def reorder_field_types(self, tp: int, line: Line): extpc = len(self.vars_ex[tp].keys()) exclc = line.count - 2 - if extpc != exclc: + if extpc > exclc: self.leexfl( - "Not enough columns in extra column reordering", extpc, exclc, line + "Not enough columns in extra column reordering", + extpc + 2, + exclc + 2, + line, ) self.warnskip(line) diff --git a/tests/data/hapfiles/basic.hap b/tests/data/hapfiles/basic.hap index 85519a05..42ed6568 100644 --- a/tests/data/hapfiles/basic.hap +++ b/tests/data/hapfiles/basic.hap @@ -1,4 +1,3 @@ - # version 0.2.0 H 21 100 110 haplotype_1 H 21 110 
125 haplotype_2 diff --git a/tests/data/hapfiles/basic_missing_ids.pvar b/tests/data/hapfiles/basic_missing_ids.pvar new file mode 100644 index 00000000..30c45929 --- /dev/null +++ b/tests/data/hapfiles/basic_missing_ids.pvar @@ -0,0 +1,4 @@ +##filedate=20230724 +##contig= +#CHROM POS ID REF ALT +21 100 variant_1 G C diff --git a/tests/data/hapfiles/valhap_with_duplicate_ids.hap b/tests/data/hapfiles/valhap_with_duplicate_ids.hap new file mode 100644 index 00000000..9647dcf0 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_duplicate_ids.hap @@ -0,0 +1,5 @@ +# version 0.2.0 +H 21 101 110 H1 +H 21 111 120 H1 +H 21 121 130 H1 +V H1 22 99 V:21:22:99:C:G X diff --git a/tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap b/tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap new file mode 100644 index 00000000..e6af8d79 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 21 101 110 H1 +V H1 33 33 V:21:22:99:C:G G +V H1 22 22 V:21:22:99:C:G G diff --git a/tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap b/tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap new file mode 100644 index 00000000..f1cb3082 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +#H extraField d +H 1 10114 10118 H1 NOT_AN_INTEGER +V H1 10116 10117 1:10116:A:G G \ No newline at end of file diff --git a/tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap b/tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap new file mode 100644 index 00000000..c4cfa33c --- /dev/null +++ b/tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap @@ -0,0 +1,8 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 1 +R 1 10116 10119 1 +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C diff --git a/tests/data/hapfiles/valhap_with_inadequate_version_columns.hap 
b/tests/data/hapfiles/valhap_with_inadequate_version_columns.hap new file mode 100644 index 00000000..c712d1e1 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inadequate_version_columns.hap @@ -0,0 +1,10 @@ +# version +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_inconvertible_ends.hap b/tests/data/hapfiles/valhap_with_inconvertible_ends.hap new file mode 100644 index 00000000..77be1a42 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inconvertible_ends.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 101x19 H1 +R 1 10116 101x19 R1 +V H1 10114 10115 1:10114:T:C T diff --git a/tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap b/tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap new file mode 100644 index 00000000..9d6544aa --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 10119 H1 +R 1 10116 10119 R1 +V H1 10114 101x15 1:10114:T:C T diff --git a/tests/data/hapfiles/valhap_with_inconvertible_starts.hap b/tests/data/hapfiles/valhap_with_inconvertible_starts.hap new file mode 100644 index 00000000..de0729c1 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inconvertible_starts.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 101x14 10119 H1 +R 1 101x16 10119 R1 +V H1 10114 10115 1:10114:T:C T diff --git a/tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap b/tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap new file mode 100644 index 00000000..1b61ba06 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap @@ -0,0 +1,4 @@ +# version 0.2.0 +H 1 10114 10119 H1 +R 1 10116 10119 R1 +V H1 101x14 10115 1:10114:T:C T diff --git a/tests/data/hapfiles/valhap_with_insufficient_columns.hap 
b/tests/data/hapfiles/valhap_with_insufficient_columns.hap new file mode 100644 index 00000000..8a99185b --- /dev/null +++ b/tests/data/hapfiles/valhap_with_insufficient_columns.hap @@ -0,0 +1,16 @@ +# version 0.2.0 +H +H 1 +H 1 10114 +H 1 10114 10118 +H 1 10114 10118 H1 +R +R 1 +R 1 10115 +R 1 10115 10119 +R 1 10115 10119 R1 +V +V H1 +V H1 10117 +V H1 10117 10118 +V H1 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap b/tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap new file mode 100644 index 00000000..5ca68ec4 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap @@ -0,0 +1,6 @@ +# version 0.2.0 +#H extraField1 d +#H extraField2 d +# orderH extraField1 +H 1 10114 10118 H1 NOT_AN_INTEGER +V H1 10116 10117 1:10116:A:G G \ No newline at end of file diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap b/tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap new file mode 100644 index 00000000..f28769d5 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H justTheName +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap b/tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap new file mode 100644 index 00000000..1562948f --- /dev/null +++ b/tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H invalidType x <- Wrong! 
+H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap b/tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap new file mode 100644 index 00000000..226ec1db --- /dev/null +++ b/tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#X inv s This is invalid! +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_multiple_order_defs.hap b/tests/data/hapfiles/valhap_with_multiple_order_defs.hap new file mode 100644 index 00000000..1940d75b --- /dev/null +++ b/tests/data/hapfiles/valhap_with_multiple_order_defs.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +#H a d +#H b d +#H c d +# orderH a b c +# orderH c b a +H 21 100 110 haplotype_1 1 2 3 +H 21 110 125 haplotype_2 3 2 1 +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A + diff --git a/tests/data/hapfiles/valhap_with_multiple_versions.hap b/tests/data/hapfiles/valhap_with_multiple_versions.hap new file mode 100644 index 00000000..2cde8fb3 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_multiple_versions.hap @@ -0,0 +1,12 @@ +# version 0.1.0 +# version 0.2.0 +# version 0.3.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_start_after_end.hap 
b/tests/data/hapfiles/valhap_with_start_after_end.hap new file mode 100644 index 00000000..8cb2445f --- /dev/null +++ b/tests/data/hapfiles/valhap_with_start_after_end.hap @@ -0,0 +1,10 @@ +# version 0.2.0 +H 1 10118 10114 H1 +H 1 10119 10114 H2 +H 1 10119 10116 H3 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_unassociated_haplotype.hap b/tests/data/hapfiles/valhap_with_unassociated_haplotype.hap new file mode 100644 index 00000000..e9214447 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_unassociated_haplotype.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +H 1 10123 10126 H4 +V H1 10114 10115 1:10114:T:C T +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap b/tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap new file mode 100644 index 00000000..571ca418 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap @@ -0,0 +1,19 @@ +# orderH ancestry beta nothing +# version 0.2.0 +#H ancestry s Local ancestry +#H beta .2f Effect size in linear model +#R beta .2f Effect size in linear model +H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 +R 21 26938353 26938400 21_26938353_STR 0.45 +H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 +H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 +V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C +V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C +V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G +V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A +V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T +V chr21.q.3365*10 
26941960 26941960 21_26941960_A_G A +V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T +V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + diff --git a/tests/data/hapfiles/valhap_with_unexistent_reorders.hap b/tests/data/hapfiles/valhap_with_unexistent_reorders.hap index 5fba544a..5a395aec 100644 --- a/tests/data/hapfiles/valhap_with_unexistent_reorders.hap +++ b/tests/data/hapfiles/valhap_with_unexistent_reorders.hap @@ -1,9 +1,11 @@ # version 0.2.0 -# Should error out! -# orderR x y z -H 1 10114 10118 H1 -H 1 10114 10119 H2 -H 1 10116 10119 H3 +#H x d +#H y d +# Should error out! Z does not exist +# orderH x y z +H 1 10114 10118 H1 1 2 +H 1 10114 10119 H2 1 2 +H 1 10116 10119 H3 1 2 V H1 10114 10115 1:10114:T:C T V H1 10116 10117 1:10116:A:G G V H2 10114 10115 1:10114:T:C C diff --git a/tests/data/hapfiles/valhap_with_unrecognizable_allele.hap b/tests/data/hapfiles/valhap_with_unrecognizable_allele.hap new file mode 100644 index 00000000..bf319414 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_unrecognizable_allele.hap @@ -0,0 +1,3 @@ +# version 0.2.0 +H 21 100 110 H1 +V H1 22 99 V:21:22:99:C:G X diff --git a/tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap b/tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap new file mode 100644 index 00000000..2a203be9 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap @@ -0,0 +1,11 @@ +# version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +R 1 10116 10119 R1 +V H1 10114 10115 1 C +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap b/tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap new file mode 100644 index 00000000..dc420011 --- /dev/null +++ b/tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap @@ -0,0 +1,12 @@ +# 
version 0.2.0 +H 1 10114 10118 H1 +H 1 10114 10119 H2 +H 1 10116 10119 H3 +R 1 10116 10119 R1 +V H1 10114 10115 1 C +V H1 10116 10117 1:10116:A:G G +V H2 10114 10115 1:10114:T:C C +V H2 10117 10118 1:10117:C:A C +V H3 10116 10117 1:10116:A:G A +V H3 10117 10118 1:10117:C:A A +V H4 10117 10118 1:10117:C:T T diff --git a/tests/test_val_hapfile.py b/tests/test_val_hapfile.py index 6098cefc..3119ecf0 100644 --- a/tests/test_val_hapfile.py +++ b/tests/test_val_hapfile.py @@ -65,44 +65,217 @@ def test_with_out_of_header_metas_unsorted(): def test_with_10_extras_reordered(): assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_10_extras_reordered.hap" - ) + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_10_extras_reordered.hap") == True ) def test_with_unexistent_reorders(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unexistent_reorders.hap") + == False + ) + + +def test_with_unexistent_fields(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unexistent_fields.hap") + == False + ) + + +def test_with_inadequate_version(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inadequate_version.hap") + == False + ) + + +def test_with_no_version(): + assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_no_version.hap") == False + + +def test_with_multiple_versions(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_multiple_versions.hap") + == False + ) + + +def test_with_inadequate_version_columns(): assert ( val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_reorders.hap" + DATADIR / "valhap_with_inadequate_version_columns.hap" ) == False ) -def test_with_unexistent_fields(): +def test_with_invalid_column_addition_column_count(): assert ( val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_fields.hap" + DATADIR / "valhap_with_invalid_column_addition_column_count.hap" ) == False ) -def test_with_inadequate_version(): +def 
test_with_invalid_column_addition_types(): assert ( val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inadequate_version.hap" + DATADIR / "valhap_with_invalid_column_addition_types.hap" ) == False ) -def test_with_no_version(): +def test_with_invalid_column_addition_data_types(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_invalid_column_addition_data_types.hap" + ) + == False + ) + + +def test_with_insufficient_columns(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_insufficient_columns.hap") + == False + ) + + +def test_with_inconvertible_starts(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_starts.hap") + == False + ) + + +def test_with_inconvertible_ends(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_ends.hap") + == False + ) + + +def test_with_inconvertible_starts(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inconvertible_starts_var.hap" + ) + == False + ) + + +def test_with_inconvertible_ends(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_ends_var.hap") + == False + ) + + +def test_valhap_with_start_after_end(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_start_after_end.hap") + == False + ) + + +def test_is_directory(): + assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_is_directory.hap") == False + + +def test_with_variant_id_of_chromosome(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_variant_id_of_chromosome.hap" + ) + == False + ) + + +def test_with_hrid_of_chromosome(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_hrid_of_chromosome.hap") + == False + ) + + +def test_with_unexistent_col_in_order(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_col_in_order.hap" + ) + == False + ) + + +def test_with_unassociated_haplotype(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / 
"valhap_with_unassociated_haplotype.hap") + == False + ) + + +def test_with_unrecognizable_allele(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unrecognizable_allele.hap") + == False + ) + + +def test_with_duplicate_ids(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_duplicate_ids.hap") == False + ) + + +def test_with_duplicate_vids_per_haplotype(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_duplicate_vids_per_haplotype.hap" + ) + == False + ) + + +def test_with_excol_of_wrong_type(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_excol_of_wrong_type.hap") + == False + ) + + +def test_with_multiple_order_defs(): + assert ( + val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_multiple_order_defs.hap") + == False + ) + + +def test_with_insufficient_excols_in_reorder(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_insufficient_excols_in_reorder.hap" + ) + == False + ) + + +def test_with_variant_inexistent_haplotype_id(): + assert ( + val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_variant_inexistent_haplotype_id.hap" + ) + == False + ) + + +def test_with_missing_variant_in_pvar(): assert ( val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_no_version.hap" + DATADIR / "simple.hap", pgen=DATADIR / "basic_missing_ids.pvar" ) == False ) From c6f1f5673c0d2753614354f3a2b0c51b672de6f0 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Wed, 26 Jul 2023 19:10:58 -0600 Subject: [PATCH 15/44] Remove debugging print statements --- haptools/val_hapfile.py | 7 ------- tests/data/hapfiles/valhap_is_directory.hap/keep.git | 0 tests/test_val_hapfile.py | 4 ++-- 3 files changed, 2 insertions(+), 9 deletions(-) create mode 100644 tests/data/hapfiles/valhap_is_directory.hap/keep.git diff --git a/haptools/val_hapfile.py b/haptools/val_hapfile.py index 35962edd..441ef734 100644 --- a/haptools/val_hapfile.py +++ b/haptools/val_hapfile.py @@ -162,9 +162,6 @@ def __init__(self, 
logger=None): def extract_and_store_content(self, file: HapFileIO, sorted: bool = True): lines = file.lines(sorted=sorted) - for line in lines: - print(line.content) - self.extract_meta_lines(lines) self.extract_data_lines(lines) @@ -597,10 +594,6 @@ def store_hrid(self, tp: int, line: Line): f"\n:: {self.hrids[tp][line[4]].content}" ) - for k1, v1 in self.hrids.items(): - for k2, v2 in v1.items(): - print(k2, ":", v2.content) - self.warnskip(line) self.errc += 1 diff --git a/tests/data/hapfiles/valhap_is_directory.hap/keep.git b/tests/data/hapfiles/valhap_is_directory.hap/keep.git new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_val_hapfile.py b/tests/test_val_hapfile.py index 3119ecf0..dd4e1dba 100644 --- a/tests/test_val_hapfile.py +++ b/tests/test_val_hapfile.py @@ -159,7 +159,7 @@ def test_with_inconvertible_ends(): ) -def test_with_inconvertible_starts(): +def test_with_inconvertible_starts_var(): assert ( val_hapfile.is_hapfile_valid( DATADIR / "valhap_with_inconvertible_starts_var.hap" @@ -168,7 +168,7 @@ def test_with_inconvertible_starts(): ) -def test_with_inconvertible_ends(): +def test_with_inconvertible_ends_var(): assert ( val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_ends_var.hap") == False From 85a298bacf0b78d5a0c4b6f8092a6fd47e539c7a Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 27 Jul 2023 07:51:52 -0700 Subject: [PATCH 16/44] fix pgenlib import issue --- tests/test_val_hapfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_val_hapfile.py b/tests/test_val_hapfile.py index dd4e1dba..a0ae2d0c 100644 --- a/tests/test_val_hapfile.py +++ b/tests/test_val_hapfile.py @@ -273,6 +273,7 @@ def test_with_variant_inexistent_haplotype_id(): def test_with_missing_variant_in_pvar(): + pgenlib = pytest.importorskip("pgenlib") assert ( val_hapfile.is_hapfile_valid( DATADIR / "simple.hap", pgen=DATADIR / "basic_missing_ids.pvar" From 
777114e948798c5ebc2a757d0aad56e8cc89a1d3 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 27 Jul 2023 14:45:43 -0600 Subject: [PATCH 17/44] Add doc base for the valhap command --- docs/commands/val_hapfile.rst | 108 ++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 109 insertions(+) create mode 100644 docs/commands/val_hapfile.rst diff --git a/docs/commands/val_hapfile.rst b/docs/commands/val_hapfile.rst new file mode 100644 index 00000000..151db304 --- /dev/null +++ b/docs/commands/val_hapfile.rst @@ -0,0 +1,108 @@ +.. _commands-valhap: + + +validate-hapfile +===== + +Validate the structure of a ``.hap`` file. + +When a ``.hap`` file contains any errors, they will be logged accordingly. + +Optionally, the haplotypes present in the ``.hap`` file can be compared against a ``.pgen`` file. + +Usage +~~~~~ +.. code-block:: bash + + haptools validate-hapfile \ + --sort \ + --genotypes PATH \ + --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \ + HAPFILE + +Examples +~~~~~~~~ +.. code-block:: bash + + haptools index tests/data/hapfiles/basic.hap + +Outputs a message specifying the amount of errors and warnings + +.. code-block:: + [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) + +All warnings and errors will be logged if there are any + +.. code-block:: bash + + haptools validate-hapfile tests/data/hapfiles/valhap_with_no_version.hap + +.. code-block:: + [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) + [ INFO] Completed HapFile validation with 0 errors and 1 warnings. (val_hapfile.py:876) + [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) + +One can use ``--no-sort`` to avoid sorting the file. +This will make it so that all unordered files will get removed, such as out-of-header lines with meta information + +.. 
code-block:: bash + + haptools validate-hapfile --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap + +Will turn: + +.. code-block:: + # orderH ancestry beta + # version 0.2.0 + #H ancestry s Local ancestry + #H beta .2f Effect size in linear model + #R beta .2f Effect size in linear model + H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 + R 21 26938353 26938400 21_26938353_STR 0.45 + H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 + H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 + # This should cause an error if the file is sorted + #V test_field s A field to test with + V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C + V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C + V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G + V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A + V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T + V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A + V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + +Into + + .. code-block:: + # orderH ancestry beta + # version 0.2.0 + #H ancestry s Local ancestry + #H beta .2f Effect size in linear model + #R beta .2f Effect size in linear model + H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 + R 21 26938353 26938400 21_26938353_STR 0.45 + H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 + H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 + V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C + V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C + V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G + V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A + V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T + V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A + V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + +.. 
warning:: + If the previous example were to be sorted then there would be several errors in the ``.hap`` file. + All sorted files parse the meta information lines first, thus the ``H`` and ``R`` lines would be incomplete + +Detailed Usage +~~~~~~~~~~~~~~ + +.. click:: haptools.__main__:main + :prog: haptools + :show-nested: + :commands: validate_hapfile diff --git a/docs/index.rst b/docs/index.rst index 0d957c62..e045f5d4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -98,6 +98,7 @@ There is an option to *Cite this repository* on the right sidebar of `the reposi commands/index.rst commands/clump.rst commands/ld.rst + commands/val_hapfile.rst .. toctree:: :caption: API From 747b43bb418a873c29cfa7ac00480d5d5eb30113 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 27 Jul 2023 15:14:50 -0600 Subject: [PATCH 18/44] Clean up docs. Add further information. --- docs/commands/val_hapfile.rst | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/commands/val_hapfile.rst b/docs/commands/val_hapfile.rst index 151db304..16509fb9 100644 --- a/docs/commands/val_hapfile.rst +++ b/docs/commands/val_hapfile.rst @@ -2,7 +2,7 @@ validate-hapfile -===== +================ Validate the structure of a ``.hap`` file. @@ -28,7 +28,7 @@ Examples Outputs a message specifying the amount of errors and warnings -.. code-block:: +.. code-block:: c [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) All warnings and errors will be logged if there are any @@ -37,7 +37,7 @@ All warnings and errors will be logged if there are any haptools validate-hapfile tests/data/hapfiles/valhap_with_no_version.hap -.. code-block:: +.. code-block:: c [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) [ INFO] Completed HapFile validation with 0 errors and 1 warnings. 
(val_hapfile.py:876) [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) @@ -51,7 +51,7 @@ This will make it so that all unordered files will get removed, such as out-of-h Will turn: -.. code-block:: +.. code-block:: bash # orderH ancestry beta # version 0.2.0 #H ancestry s Local ancestry @@ -75,7 +75,7 @@ Will turn: Into - .. code-block:: +.. code-block:: bash # orderH ancestry beta # version 0.2.0 #H ancestry s Local ancestry @@ -99,6 +99,15 @@ Into If the previous example were to be sorted then there would be several errors in the ``.hap`` file. All sorted files parse the meta information lines first, thus the ``H`` and ``R`` lines would be incomplete +As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pgen`` file with which to compare the existence of variant IDs +The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` associated to the ``.pgen`` + +.. code-block:: bash + haptools validate-hapfile --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap + +.. warning:: + You must generate a ``.pvar`` from your ``.pgen`` file. This is done in order to avoid reading heavy amounts of information which is not relevant to the validation process. + Detailed Usage ~~~~~~~~~~~~~~ From f28b902da00c30887ce19b446874d6616a4333b3 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 27 Jul 2023 15:17:41 -0600 Subject: [PATCH 19/44] Fix indentation --- docs/commands/val_hapfile.rst | 101 +++++++++++++++++----------------- 1 file changed, 49 insertions(+), 52 deletions(-) diff --git a/docs/commands/val_hapfile.rst b/docs/commands/val_hapfile.rst index 16509fb9..8727ae1b 100644 --- a/docs/commands/val_hapfile.rst +++ b/docs/commands/val_hapfile.rst @@ -13,7 +13,6 @@ Optionally, the haplotypes present in the ``.hap`` file can be compared against Usage ~~~~~ .. 
code-block:: bash - haptools validate-hapfile \ --sort \ --genotypes PATH \ @@ -29,89 +28,87 @@ Examples Outputs a message specifying the amount of errors and warnings .. code-block:: c - [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) + [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) All warnings and errors will be logged if there are any .. code-block:: bash - haptools validate-hapfile tests/data/hapfiles/valhap_with_no_version.hap .. code-block:: c - [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) - [ INFO] Completed HapFile validation with 0 errors and 1 warnings. (val_hapfile.py:876) - [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) + [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) + [ INFO] Completed HapFile validation with 0 errors and 1 warnings. (val_hapfile.py:876) + [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) One can use ``--no-sort`` to avoid sorting the file. This will make it so that all unordered files will get removed, such as out-of-header lines with meta information .. code-block:: bash - haptools validate-hapfile --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap Will turn: .. 
code-block:: bash - # orderH ancestry beta - # version 0.2.0 - #H ancestry s Local ancestry - #H beta .2f Effect size in linear model - #R beta .2f Effect size in linear model - H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 - R 21 26938353 26938400 21_26938353_STR 0.45 - H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 - H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 - # This should cause an error if the file is sorted - #V test_field s A field to test with - V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C - V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C - V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G - V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A - V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T - V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A - V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + # orderH ancestry beta + # version 0.2.0 + #H ancestry s Local ancestry + #H beta .2f Effect size in linear model + #R beta .2f Effect size in linear model + H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 + R 21 26938353 26938400 21_26938353_STR 0.45 + H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 + H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 + # This should cause an error if the file is sorted + #V test_field s A field to test with + V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C + V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C + V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G + V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A + V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T + V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A + V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A Into .. 
code-block:: bash - # orderH ancestry beta - # version 0.2.0 - #H ancestry s Local ancestry - #H beta .2f Effect size in linear model - #R beta .2f Effect size in linear model - H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 - R 21 26938353 26938400 21_26938353_STR 0.45 - H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 - H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 - V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C - V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C - V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G - V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A - V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T - V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A - V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A + # orderH ancestry beta + # version 0.2.0 + #H ancestry s Local ancestry + #H beta .2f Effect size in linear model + #R beta .2f Effect size in linear model + H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 + R 21 26938353 26938400 21_26938353_STR 0.45 + H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 + H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 + V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C + V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C + V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G + V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A + V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T + V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A + V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T + V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A .. warning:: - If the previous example were to be sorted then there would be several errors in the ``.hap`` file. 
- All sorted files parse the meta information lines first, thus the ``H`` and ``R`` lines would be incomplete + If the previous example were to be sorted then there would be several errors in the ``.hap`` file. + All sorted files parse the meta information lines first, thus the ``H`` and ``R`` lines would be incomplete As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pgen`` file with which to compare the existence of variant IDs The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` associated to the ``.pgen`` .. code-block:: bash - haptools validate-hapfile --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap + haptools validate-hapfile --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap .. warning:: - You must generate a ``.pvar`` from your ``.pgen`` file. This is done in order to avoid reading heavy amounts of information which is not relevant to the validation process. + You must generate a ``.pvar`` from your ``.pgen`` file. This is done in order to avoid reading heavy amounts of information which is not relevant to the validation process. Detailed Usage ~~~~~~~~~~~~~~ .. click:: haptools.__main__:main - :prog: haptools - :show-nested: - :commands: validate_hapfile + :prog: haptools + :show-nested: + :commands: validate_hapfile From dbe6d87a8b3371358f8ebfb30c1c357569e5d3e4 Mon Sep 17 00:00:00 2001 From: Ayimany Date: Thu, 27 Jul 2023 15:45:47 -0600 Subject: [PATCH 20/44] Fix format. --- docs/commands/val_hapfile.rst | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/docs/commands/val_hapfile.rst b/docs/commands/val_hapfile.rst index 8727ae1b..e2026937 100644 --- a/docs/commands/val_hapfile.rst +++ b/docs/commands/val_hapfile.rst @@ -13,6 +13,7 @@ Optionally, the haplotypes present in the ``.hap`` file can be compared against Usage ~~~~~ .. 
code-block:: bash + haptools validate-hapfile \ --sort \ --genotypes PATH \ @@ -25,30 +26,35 @@ Examples haptools index tests/data/hapfiles/basic.hap -Outputs a message specifying the amount of errors and warnings +Outputs a message specifying the amount of errors and warnings. + +.. code-block:: -.. code-block:: c [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) All warnings and errors will be logged if there are any .. code-block:: bash + haptools validate-hapfile tests/data/hapfiles/valhap_with_no_version.hap -.. code-block:: c +.. code-block:: + [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) [ INFO] Completed HapFile validation with 0 errors and 1 warnings. (val_hapfile.py:876) [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) One can use ``--no-sort`` to avoid sorting the file. -This will make it so that all unordered files will get removed, such as out-of-header lines with meta information +This will make it so that all unordered files will get removed, such as out-of-header lines with meta information. .. code-block:: bash + haptools validate-hapfile --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap Will turn: -.. code-block:: bash +.. code-block:: + # orderH ancestry beta # version 0.2.0 #H ancestry s Local ancestry @@ -72,7 +78,8 @@ Will turn: Into -.. code-block:: bash +.. code-block:: + # orderH ancestry beta # version 0.2.0 #H ancestry s Local ancestry @@ -92,18 +99,22 @@ Into V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A -.. warning:: - If the previous example were to be sorted then there would be several errors in the ``.hap`` file. 
- All sorted files parse the meta information lines first, thus the ``H`` and ``R`` lines would be incomplete -As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pgen`` file with which to compare the existence of variant IDs -The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` associated to the ``.pgen`` +If the previous example were to be sorted then there would be several errors in the ``.hap`` file. +All sorted files parse the meta information lines first, thus the ``V`` lines would be incomplete. + +As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pgen`` file with which to compare the existence of variant IDs. +The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` associated to the ``.pgen``. .. code-block:: bash + haptools validate-hapfile --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap .. warning:: - You must generate a ``.pvar`` from your ``.pgen`` file. This is done in order to avoid reading heavy amounts of information which is not relevant to the validation process. + + You must generate a ``.pvar`` from your ``.pgen`` file. + This is done in order to avoid reading heavy amounts of + information which is not relevant to the validation process. 
Detailed Usage ~~~~~~~~~~~~~~ From 32468f90c71dd5c13204690ddcb94f370543aaba Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 30 Jul 2023 10:49:17 -0700 Subject: [PATCH 21/44] rename from val_hapfile to to 'validate' since we may want the validate command to validate other kinds of files besides hap files in the future --- .../{val_hapfile.rst => validate.rst} | 28 +++++++++---------- docs/index.rst | 4 ++- haptools/__main__.py | 10 +++---- haptools/{val_hapfile.py => validate.py} | 4 +-- .../{test_val_hapfile.py => test_validate.py} | 4 +-- 5 files changed, 23 insertions(+), 27 deletions(-) rename docs/commands/{val_hapfile.rst => validate.rst} (85%) rename haptools/{val_hapfile.py => validate.py} (99%) rename tests/{test_val_hapfile.py => test_validate.py} (98%) diff --git a/docs/commands/val_hapfile.rst b/docs/commands/validate.rst similarity index 85% rename from docs/commands/val_hapfile.rst rename to docs/commands/validate.rst index e2026937..c4a48a9c 100644 --- a/docs/commands/val_hapfile.rst +++ b/docs/commands/validate.rst @@ -1,8 +1,8 @@ -.. _commands-valhap: +.. _commands-validate: -validate-hapfile -================ +validate +======== Validate the structure of a ``.hap`` file. @@ -14,7 +14,7 @@ Usage ~~~~~ .. code-block:: bash - haptools validate-hapfile \ + haptools validate \ --sort \ --genotypes PATH \ --verbosity [CRITICAL|ERROR|WARNING|INFO|DEBUG|NOTSET] \ @@ -24,32 +24,32 @@ Examples ~~~~~~~~ .. code-block:: bash - haptools index tests/data/hapfiles/basic.hap + haptools validate tests/data/hapfiles/basic.hap Outputs a message specifying the amount of errors and warnings. .. code-block:: - [ INFO] Completed HapFile validation with 0 errors and 0 warnings. (val_hapfile.py:876) + [ INFO] Completed HapFile validation with 0 errors and 0 warnings. -All warnings and errors will be logged if there are any +All warnings and errors will be logged if there are any. .. 
code-block:: bash - haptools validate-hapfile tests/data/hapfiles/valhap_with_no_version.hap + haptools validate tests/data/hapfiles/valhap_with_no_version.hap .. code-block:: - [ WARNING] No version declaration found. Assuming to use the latest version. (val_hapfile.py:199) - [ INFO] Completed HapFile validation with 0 errors and 1 warnings. (val_hapfile.py:876) - [ WARNING] Found several warnings and / or errors in the hapfile (__main__.py:1071) + [ WARNING] No version declaration found. Assuming to use the latest version. + [ INFO] Completed HapFile validation with 0 errors and 1 warnings. + [ WARNING] Found several warnings and / or errors in the hapfile One can use ``--no-sort`` to avoid sorting the file. This will make it so that all unordered files will get removed, such as out-of-header lines with meta information. .. code-block:: bash - haptools validate-hapfile --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap + haptools validate --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap Will turn: @@ -108,7 +108,7 @@ The following will check if all of the variant IDs in the ``.hap`` appear in the .. code-block:: bash - haptools validate-hapfile --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap + haptools validate --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap .. warning:: @@ -122,4 +122,4 @@ Detailed Usage .. click:: haptools.__main__:main :prog: haptools :show-nested: - :commands: validate_hapfile + :commands: validate diff --git a/docs/index.rst b/docs/index.rst index e045f5d4..7e02fd34 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -20,6 +20,8 @@ Commands * :doc:`haptools transform `: Transform a set of genotypes via a list of haplotypes. Create a new VCF containing haplotypes instead of variants. +* :doc:`haptools validate `: Validate the formatting of a haplotype file. 
+ * :doc:`haptools index `: Sort, compress, and index our custom file format for haplotypes. * :doc:`haptools clump `: Convert variants in LD with one another into clumps. @@ -95,10 +97,10 @@ There is an option to *Cite this repository* on the right sidebar of `the reposi commands/simphenotype.rst commands/karyogram.rst commands/transform.rst + commands/validate.rst commands/index.rst commands/clump.rst commands/ld.rst - commands/val_hapfile.rst .. toctree:: :caption: API diff --git a/haptools/__main__.py b/haptools/__main__.py index 850df37b..1bc4ae44 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1052,20 +1052,18 @@ def clump( show_default=True, help="The level of verbosity desired", ) -def validate_hapfile( +def validate( filename: Path, sort: bool, genotypes: Path | None = None, verbosity: str = "DEBUG", ): - from haptools import val_hapfile from .logging import getLogger + from .validate import is_hapfile_valid - log = getLogger(name="validate-hapfile", level=verbosity) + log = getLogger(name="validate", level=verbosity) - is_valid = val_hapfile.is_hapfile_valid( - filename, sorted=sort, logger=log, pgen=genotypes - ) + is_valid = is_hapfile_valid(filename, sorted=sort, logger=log, pgen=genotypes) if not is_valid: log.warn("Found several warnings and / or errors in the hapfile") diff --git a/haptools/val_hapfile.py b/haptools/validate.py similarity index 99% rename from haptools/val_hapfile.py rename to haptools/validate.py index 441ef734..349e147a 100644 --- a/haptools/val_hapfile.py +++ b/haptools/validate.py @@ -6,13 +6,11 @@ from re import search from pathlib import Path -from pysam import VariantFile - from .logging import logging from .data import GenotypesPLINK -LOGGER_NAME = "valhap" +LOGGER_NAME = "validate" LTS_SPEC = "0.2.0" diff --git a/tests/test_val_hapfile.py b/tests/test_validate.py similarity index 98% rename from tests/test_val_hapfile.py rename to tests/test_validate.py index a0ae2d0c..e6a78d0a 100644 --- 
a/tests/test_val_hapfile.py +++ b/tests/test_validate.py @@ -1,11 +1,9 @@ -import os from pathlib import Path import pytest from . import test_data -from haptools import val_hapfile -from haptools import data +from haptools import validate as val_hapfile DATADIR = Path(__file__).parent.joinpath("data").joinpath("hapfiles") From 390eaeb74c81d1fd688902c203720d782ffdb667 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 14 Sep 2023 13:33:15 -0700 Subject: [PATCH 22/44] implement some suggestions from PR --- docs/commands/validate.rst | 17 +++++++++-------- haptools/__main__.py | 1 - 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index c4a48a9c..12a6f6ff 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -8,7 +8,7 @@ Validate the structure of a ``.hap`` file. When a ``.hap`` file contains any errors, they will be logged accordingly. -Optionally, the haplotypes present in the ``.hap`` file can be compared against a ``.pgen`` file. +Optionally, the SNPs and TRs present in the ``.hap`` file can be compared against a ``.pgen`` file. Usage ~~~~~ @@ -103,18 +103,19 @@ Into If the previous example were to be sorted then there would be several errors in the ``.hap`` file. All sorted files parse the meta information lines first, thus the ``V`` lines would be incomplete. -As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pgen`` file with which to compare the existence of variant IDs. -The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` associated to the ``.pgen``. +As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pvar`` file with which to compare the existence of variant IDs. +The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` file. .. 
code-block:: bash - haptools validate --genotypes tests/data/hapfiles/valhap_test_data.pgen tests/data/hapfiles/valhap_test_data.hap + haptools validate --genotypes tests/data/hapfiles/valhap_test_data.pvar tests/data/hapfiles/valhap_test_data.hap -.. warning:: +.. note:: - You must generate a ``.pvar`` from your ``.pgen`` file. - This is done in order to avoid reading heavy amounts of - information which is not relevant to the validation process. + We accept a PVAR file instead of a VCF in order to avoid reading lots of + information which is not relevant to the validation process. However, any + VCF without a FORMAT field is a valid PVAR file. So you can easily create a PVAR file + using the ``cut`` command or ``plink2 --make-just-pvar``. Detailed Usage ~~~~~~~~~~~~~~ diff --git a/haptools/__main__.py b/haptools/__main__.py index 1bc4ae44..37f0fc07 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1,7 +1,6 @@ #!/usr/bin/env python from __future__ import annotations -from enum import Flag import sys from pathlib import Path From 8b324ace689605ae01b3695306eb15b4046a01f4 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Thu, 14 Sep 2023 16:57:36 -0400 Subject: [PATCH 23/44] Use relative import for logging module --- haptools/validate.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/haptools/validate.py b/haptools/validate.py index 349e147a..eea4ff58 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -1,12 +1,12 @@ from __future__ import annotations import os -from haptools import logging +import logging from re import search from pathlib import Path -from .logging import logging +from .logging import getLogger from .data import GenotypesPLINK @@ -36,7 +36,7 @@ def __str__(self) -> str: class HapFileIO: def __init__(self, filename: Path, logger=None): self.filename = filename - self.log = logger or 
logging.getLogger(self.__class__.__name__) def lines(self, sorted: bool = True) -> list[Line]: buffer = open(self.filename) @@ -124,7 +124,7 @@ class HapFileValidator: KEY_ALLELE: str = "HT::Allele" def __init__(self, logger=None): - self.log = logger or logging.getLogger(LOGGER_NAME) + self.log = logger or logging.getLogger(self.__class__.__name__) self.vars_ex: dict[int, dict[str, type]] = { HapFileValidator.KEY_HAPLOTYPE: {}, @@ -833,12 +833,11 @@ def is_hapfile_valid( sorted: bool = True, pgen: Path | None = None, max_variants: int = 10000, - logger=None, + log: logging.Logger = None, ) -> bool: - log = logger if log == None: - log = logging.getLogger(LOGGER_NAME) + log = getLogger(LOGGER_NAME) file = HapFileIO(filename, logger=log) From 57c81f84184d547303c635c5eb7862bc42b4ccdc Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 15 Sep 2023 14:40:57 -0700 Subject: [PATCH 24/44] accept pvar instead of pgen --- docs/commands/validate.rst | 2 +- haptools/validate.py | 7 +++---- tests/test_validate.py | 6 +++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index 12a6f6ff..fa39e746 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -8,7 +8,7 @@ Validate the structure of a ``.hap`` file. When a ``.hap`` file contains any errors, they will be logged accordingly. -Optionally, the SNPs and TRs present in the ``.hap`` file can be compared against a ``.pgen`` file. +If provided, the SNPs and TRs present in the ``.hap`` file will be confirmed to exist in a ``.pvar`` file. 
Usage ~~~~~ diff --git a/haptools/validate.py b/haptools/validate.py index eea4ff58..6feec7bf 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -831,11 +831,10 @@ def warnskip(self, line: Line): def is_hapfile_valid( filename: Path, sorted: bool = True, - pgen: Path | None = None, + pvar: Path | None = None, max_variants: int = 10000, log: logging.Logger = None, ) -> bool: - if log == None: log = getLogger(LOGGER_NAME) @@ -862,8 +861,8 @@ def is_hapfile_valid( hapfile.validate_version_declarations() - if pgen != None: - varfile = GenotypesPLINK(pgen) + if pvar is not None: + varfile = GenotypesPLINK(pvar.with_suffix(".pgen")) varfile.read_variants(max_variants=max_variants) ids = list(map(lambda v: v[0], varfile.variants)) diff --git a/tests/test_validate.py b/tests/test_validate.py index e6a78d0a..28c2fc2b 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -5,7 +5,7 @@ from . import test_data from haptools import validate as val_hapfile -DATADIR = Path(__file__).parent.joinpath("data").joinpath("hapfiles") +DATADIR = Path(__file__).parent.joinpath("data") / "hapfiles" def _generate_fake_haps(): @@ -28,7 +28,7 @@ def test_generated_haplotypes(): assert ( val_hapfile.is_hapfile_valid( - DATADIR / "valhap_test_data.hap", pgen=DATADIR / "valhap_test_data.pvar" + DATADIR / "valhap_test_data.hap", pvar=DATADIR / "valhap_test_data.pvar" ) == True ) @@ -274,7 +274,7 @@ def test_with_missing_variant_in_pvar(): pgenlib = pytest.importorskip("pgenlib") assert ( val_hapfile.is_hapfile_valid( - DATADIR / "simple.hap", pgen=DATADIR / "basic_missing_ids.pvar" + DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" ) == False ) From 61ac08c1b84960a4ff40d5383a6361ccec09ea73 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 11:14:10 -0700 Subject: [PATCH 25/44] change up logging to be silent by default when called from command line --- haptools/__main__.py | 4 ++-- 
haptools/validate.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 37f0fc07..5f4c160a 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1055,14 +1055,14 @@ def validate( filename: Path, sort: bool, genotypes: Path | None = None, - verbosity: str = "DEBUG", + verbosity: str = "INFO", ): from .logging import getLogger from .validate import is_hapfile_valid log = getLogger(name="validate", level=verbosity) - is_valid = is_hapfile_valid(filename, sorted=sort, logger=log, pgen=genotypes) + is_valid = is_hapfile_valid(filename, sorted=sort, log=log, pgen=genotypes) if not is_valid: log.warn("Found several warnings and / or errors in the hapfile") diff --git a/haptools/validate.py b/haptools/validate.py index 6feec7bf..d682e18c 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -11,7 +11,6 @@ LOGGER_NAME = "validate" -LTS_SPEC = "0.2.0" def tmpex(expectation: object, received: object) -> str: @@ -836,7 +835,7 @@ def is_hapfile_valid( log: logging.Logger = None, ) -> bool: if log == None: - log = getLogger(LOGGER_NAME) + log = getLogger(name=LOGGER_NAME, level="CRITICAL") file = HapFileIO(filename, logger=log) From c4ecaecf61af9d7460e0edb62d147238b8f919f8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 16 Sep 2023 11:15:04 -0700 Subject: [PATCH 26/44] reformat test_validate.py for concision --- tests/data/hapfiles/valhap_test_data.hap | 10 - tests/data/hapfiles/valhap_test_data.pvar | 8 - tests/test_validate.py | 211 +++++++--------------- 3 files changed, 64 insertions(+), 165 deletions(-) delete mode 100644 tests/data/hapfiles/valhap_test_data.hap delete mode 100644 tests/data/hapfiles/valhap_test_data.pvar diff --git a/tests/data/hapfiles/valhap_test_data.hap b/tests/data/hapfiles/valhap_test_data.hap deleted file mode 100644 index 2ecefe21..00000000 --- a/tests/data/hapfiles/valhap_test_data.hap +++ /dev/null @@ -1,10 
+0,0 @@ -# version 0.2.0 -H 1 10114 10118 H1 -H 1 10114 10119 H2 -H 1 10116 10119 H3 -V H1 10114 10115 1:10114:T:C T -V H1 10116 10117 1:10116:A:G G -V H2 10114 10115 1:10114:T:C C -V H2 10117 10118 1:10117:C:A C -V H3 10116 10117 1:10116:A:G A -V H3 10117 10118 1:10117:C:A A diff --git a/tests/data/hapfiles/valhap_test_data.pvar b/tests/data/hapfiles/valhap_test_data.pvar deleted file mode 100644 index ca962b0b..00000000 --- a/tests/data/hapfiles/valhap_test_data.pvar +++ /dev/null @@ -1,8 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 10114 1:10114:T:C T C . . . -1 10116 1:10116:A:G A G . . . -1 10117 1:10117:C:A C A . . . -1 10122 1:10122:A:G A G . . . diff --git a/tests/test_validate.py b/tests/test_validate.py index 28c2fc2b..598a9ec0 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -8,277 +8,194 @@ DATADIR = Path(__file__).parent.joinpath("data") / "hapfiles" -def _generate_fake_haps(): - haps_ = test_data.TestHaplotypes() - haps = haps_._get_dummy_haps() - haps.fname = Path(DATADIR / "valhap_test_data.hap") - haps.write() - - -def _generate_fake_vars(): - vars_ = test_data.TestGenotypesPLINK() - vars = vars_._get_fake_genotypes_plink() - vars.fname = Path(DATADIR / "valhap_test_data.plink") - vars.write_variants() - - def test_generated_haplotypes(): - _generate_fake_haps() - _generate_fake_vars() - - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_test_data.hap", pvar=DATADIR / "valhap_test_data.pvar" - ) - == True - ) + datadir = Path(__file__).parent.joinpath("data") + hapfile = Path(datadir / "simple.hap") + pvarfile = Path(datadir / "simple.pvar") + + assert val_hapfile.is_hapfile_valid(hapfile, pvar=pvarfile) def test_with_empty_lines(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_empty_lines.hap", - ) - == True - ) + assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_empty_lines.hap") def test_with_out_of_header_metas_sorted(): - assert ( - 
val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_out_of_header_metas.hap", sorted=True - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_out_of_header_metas.hap", sorted=True ) def test_with_out_of_header_metas_unsorted(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_out_of_header_metas.hap", sorted=False - ) - == True + assert val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_out_of_header_metas.hap", sorted=False ) def test_with_10_extras_reordered(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_10_extras_reordered.hap") - == True - ) + assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_10_extras_reordered.hap") def test_with_unexistent_reorders(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unexistent_reorders.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_reorders.hap" ) def test_with_unexistent_fields(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unexistent_fields.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_fields.hap" ) def test_with_inadequate_version(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inadequate_version.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inadequate_version.hap" ) def test_with_no_version(): - assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_no_version.hap") == False + assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_no_version.hap") def test_with_multiple_versions(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_multiple_versions.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_multiple_versions.hap" ) def test_with_inadequate_version_columns(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inadequate_version_columns.hap" - ) - == False + 
assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inadequate_version_columns.hap" ) def test_with_invalid_column_addition_column_count(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_column_count.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_invalid_column_addition_column_count.hap" ) def test_with_invalid_column_addition_types(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_types.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_invalid_column_addition_types.hap" ) def test_with_invalid_column_addition_data_types(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_data_types.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_invalid_column_addition_data_types.hap" ) def test_with_insufficient_columns(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_insufficient_columns.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_insufficient_columns.hap" ) def test_with_inconvertible_starts(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_starts.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inconvertible_starts.hap" ) def test_with_inconvertible_ends(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_inconvertible_ends.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inconvertible_ends.hap" ) def test_with_inconvertible_starts_var(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inconvertible_starts_var.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inconvertible_starts_var.hap" ) def test_with_inconvertible_ends_var(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / 
"valhap_with_inconvertible_ends_var.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_inconvertible_ends_var.hap" ) def test_valhap_with_start_after_end(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_start_after_end.hap") - == False - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_start_after_end.hap") def test_is_directory(): - assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_is_directory.hap") == False + assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_is_directory.hap") def test_with_variant_id_of_chromosome(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_variant_id_of_chromosome.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_variant_id_of_chromosome.hap" ) def test_with_hrid_of_chromosome(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_hrid_of_chromosome.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_hrid_of_chromosome.hap" ) def test_with_unexistent_col_in_order(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_col_in_order.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unexistent_col_in_order.hap" ) def test_with_unassociated_haplotype(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unassociated_haplotype.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unassociated_haplotype.hap" ) def test_with_unrecognizable_allele(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_unrecognizable_allele.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_unrecognizable_allele.hap" ) def test_with_duplicate_ids(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_duplicate_ids.hap") == False - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / 
"valhap_with_duplicate_ids.hap") def test_with_duplicate_vids_per_haplotype(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_duplicate_vids_per_haplotype.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_duplicate_vids_per_haplotype.hap" ) def test_with_excol_of_wrong_type(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_excol_of_wrong_type.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_excol_of_wrong_type.hap" ) def test_with_multiple_order_defs(): - assert ( - val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_multiple_order_defs.hap") - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_multiple_order_defs.hap" ) def test_with_insufficient_excols_in_reorder(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_insufficient_excols_in_reorder.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_insufficient_excols_in_reorder.hap" ) def test_with_variant_inexistent_haplotype_id(): - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_variant_inexistent_haplotype_id.hap" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "valhap_with_variant_inexistent_haplotype_id.hap" ) def test_with_missing_variant_in_pvar(): pgenlib = pytest.importorskip("pgenlib") - assert ( - val_hapfile.is_hapfile_valid( - DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" - ) - == False + assert not val_hapfile.is_hapfile_valid( + DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" ) def test_unreadable_hapfile(): - assert val_hapfile.is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) == False + assert not val_hapfile.is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) From 6bbee4b7b259359f54a1fe32c2857a0700f2a01b Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 17 Sep 2023 11:00:12 -0700 
Subject: [PATCH 27/44] rename test data dir and remove valhap prefix --- docs/commands/validate.rst | 8 +- .../10_extras_reordered.hap} | 0 tests/data/{hapfiles => valhap}/basic.hap | 0 tests/data/{hapfiles => valhap}/basic.pvar | 0 .../basic_missing_ids.pvar | 0 .../valhap_correct.hap => valhap/correct.hap} | 0 .../duplicate_ids.hap} | 0 .../duplicate_vids_per_haplotype.hap} | 0 .../empty_lines.hap} | 0 .../excol_of_wrong_type.hap} | 0 .../hrid_of_chromosome.hap} | 0 .../inadequate_version.hap} | 0 .../inadequate_version_columns.hap} | 0 .../inconvertible_ends.hap} | 0 .../inconvertible_ends_var.hap} | 0 .../inconvertible_starts.hap} | 0 .../inconvertible_starts_var.hap} | 0 .../insufficient_columns.hap} | 0 .../insufficient_excols_in_reorder.hap} | 0 .../invalid_column_addition_column_count.hap} | 0 .../invalid_column_addition_data_types.hap} | 0 .../invalid_column_addition_types.hap} | 0 .../is_directory.hap}/keep.git | 0 .../multiple_order_defs.hap} | 0 .../multiple_versions.hap} | 0 .../no_version.hap} | 0 .../out_of_header_metas.hap} | 0 tests/data/{hapfiles => valhap}/simple.hap | 0 tests/data/{hapfiles => valhap}/simple.pvar | 0 .../start_after_end.hap} | 0 .../unassociated_haplotype.hap} | 0 .../unexistent_col_in_order.hap} | 0 .../unexistent_fields.hap} | 0 .../unexistent_reorders.hap} | 0 .../unrecognizable_allele.hap} | 0 .../variant_id_of_chromosome.hap} | 0 .../variant_inexistent_haplotype_id.hap} | 0 tests/test_validate.py | 100 ++++++------------ 38 files changed, 37 insertions(+), 71 deletions(-) rename tests/data/{hapfiles/valhap_with_10_extras_reordered.hap => valhap/10_extras_reordered.hap} (100%) rename tests/data/{hapfiles => valhap}/basic.hap (100%) rename tests/data/{hapfiles => valhap}/basic.pvar (100%) rename tests/data/{hapfiles => valhap}/basic_missing_ids.pvar (100%) rename tests/data/{hapfiles/valhap_correct.hap => valhap/correct.hap} (100%) rename tests/data/{hapfiles/valhap_with_duplicate_ids.hap => valhap/duplicate_ids.hap} (100%) 
rename tests/data/{hapfiles/valhap_with_duplicate_vids_per_haplotype.hap => valhap/duplicate_vids_per_haplotype.hap} (100%) rename tests/data/{hapfiles/valhap_with_empty_lines.hap => valhap/empty_lines.hap} (100%) rename tests/data/{hapfiles/valhap_with_excol_of_wrong_type.hap => valhap/excol_of_wrong_type.hap} (100%) rename tests/data/{hapfiles/valhap_with_hrid_of_chromosome.hap => valhap/hrid_of_chromosome.hap} (100%) rename tests/data/{hapfiles/valhap_with_inadequate_version.hap => valhap/inadequate_version.hap} (100%) rename tests/data/{hapfiles/valhap_with_inadequate_version_columns.hap => valhap/inadequate_version_columns.hap} (100%) rename tests/data/{hapfiles/valhap_with_inconvertible_ends.hap => valhap/inconvertible_ends.hap} (100%) rename tests/data/{hapfiles/valhap_with_inconvertible_ends_var.hap => valhap/inconvertible_ends_var.hap} (100%) rename tests/data/{hapfiles/valhap_with_inconvertible_starts.hap => valhap/inconvertible_starts.hap} (100%) rename tests/data/{hapfiles/valhap_with_inconvertible_starts_var.hap => valhap/inconvertible_starts_var.hap} (100%) rename tests/data/{hapfiles/valhap_with_insufficient_columns.hap => valhap/insufficient_columns.hap} (100%) rename tests/data/{hapfiles/valhap_with_insufficient_excols_in_reorder.hap => valhap/insufficient_excols_in_reorder.hap} (100%) rename tests/data/{hapfiles/valhap_with_invalid_column_addition_column_count.hap => valhap/invalid_column_addition_column_count.hap} (100%) rename tests/data/{hapfiles/valhap_with_invalid_column_addition_data_types.hap => valhap/invalid_column_addition_data_types.hap} (100%) rename tests/data/{hapfiles/valhap_with_invalid_column_addition_types.hap => valhap/invalid_column_addition_types.hap} (100%) rename tests/data/{hapfiles/valhap_is_directory.hap => valhap/is_directory.hap}/keep.git (100%) rename tests/data/{hapfiles/valhap_with_multiple_order_defs.hap => valhap/multiple_order_defs.hap} (100%) rename tests/data/{hapfiles/valhap_with_multiple_versions.hap => 
valhap/multiple_versions.hap} (100%) rename tests/data/{hapfiles/valhap_with_no_version.hap => valhap/no_version.hap} (100%) rename tests/data/{hapfiles/valhap_with_out_of_header_metas.hap => valhap/out_of_header_metas.hap} (100%) rename tests/data/{hapfiles => valhap}/simple.hap (100%) rename tests/data/{hapfiles => valhap}/simple.pvar (100%) rename tests/data/{hapfiles/valhap_with_start_after_end.hap => valhap/start_after_end.hap} (100%) rename tests/data/{hapfiles/valhap_with_unassociated_haplotype.hap => valhap/unassociated_haplotype.hap} (100%) rename tests/data/{hapfiles/valhap_with_unexistent_col_in_order.hap => valhap/unexistent_col_in_order.hap} (100%) rename tests/data/{hapfiles/valhap_with_unexistent_fields.hap => valhap/unexistent_fields.hap} (100%) rename tests/data/{hapfiles/valhap_with_unexistent_reorders.hap => valhap/unexistent_reorders.hap} (100%) rename tests/data/{hapfiles/valhap_with_unrecognizable_allele.hap => valhap/unrecognizable_allele.hap} (100%) rename tests/data/{hapfiles/valhap_with_variant_id_of_chromosome.hap => valhap/variant_id_of_chromosome.hap} (100%) rename tests/data/{hapfiles/valhap_with_variant_inexistent_haplotype_id.hap => valhap/variant_inexistent_haplotype_id.hap} (100%) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index fa39e746..99b520ea 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -24,7 +24,7 @@ Examples ~~~~~~~~ .. code-block:: bash - haptools validate tests/data/hapfiles/basic.hap + haptools validate tests/data/valhap/basic.hap Outputs a message specifying the amount of errors and warnings. @@ -36,7 +36,7 @@ All warnings and errors will be logged if there are any. .. code-block:: bash - haptools validate tests/data/hapfiles/valhap_with_no_version.hap + haptools validate tests/data/valhap/no_version.hap .. code-block:: @@ -49,7 +49,7 @@ This will make it so that all unordered files will get removed, such as out-of-h .. 
code-block:: bash - haptools validate --no-sort tests/data/hapfiles/valhap_with_out_of_header_metas.hap + haptools validate --no-sort tests/data/valhap/out_of_header_metas.hap Will turn: @@ -108,7 +108,7 @@ The following will check if all of the variant IDs in the ``.hap`` appear in the .. code-block:: bash - haptools validate --genotypes tests/data/hapfiles/valhap_test_data.pvar tests/data/hapfiles/valhap_test_data.hap + haptools validate --genotypes tests/data/valhap/test_data.pvar tests/data/valhap/test_data.hap .. note:: diff --git a/tests/data/hapfiles/valhap_with_10_extras_reordered.hap b/tests/data/valhap/10_extras_reordered.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_10_extras_reordered.hap rename to tests/data/valhap/10_extras_reordered.hap diff --git a/tests/data/hapfiles/basic.hap b/tests/data/valhap/basic.hap similarity index 100% rename from tests/data/hapfiles/basic.hap rename to tests/data/valhap/basic.hap diff --git a/tests/data/hapfiles/basic.pvar b/tests/data/valhap/basic.pvar similarity index 100% rename from tests/data/hapfiles/basic.pvar rename to tests/data/valhap/basic.pvar diff --git a/tests/data/hapfiles/basic_missing_ids.pvar b/tests/data/valhap/basic_missing_ids.pvar similarity index 100% rename from tests/data/hapfiles/basic_missing_ids.pvar rename to tests/data/valhap/basic_missing_ids.pvar diff --git a/tests/data/hapfiles/valhap_correct.hap b/tests/data/valhap/correct.hap similarity index 100% rename from tests/data/hapfiles/valhap_correct.hap rename to tests/data/valhap/correct.hap diff --git a/tests/data/hapfiles/valhap_with_duplicate_ids.hap b/tests/data/valhap/duplicate_ids.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_duplicate_ids.hap rename to tests/data/valhap/duplicate_ids.hap diff --git a/tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap b/tests/data/valhap/duplicate_vids_per_haplotype.hap similarity index 100% rename from 
tests/data/hapfiles/valhap_with_duplicate_vids_per_haplotype.hap rename to tests/data/valhap/duplicate_vids_per_haplotype.hap diff --git a/tests/data/hapfiles/valhap_with_empty_lines.hap b/tests/data/valhap/empty_lines.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_empty_lines.hap rename to tests/data/valhap/empty_lines.hap diff --git a/tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap b/tests/data/valhap/excol_of_wrong_type.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_excol_of_wrong_type.hap rename to tests/data/valhap/excol_of_wrong_type.hap diff --git a/tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap b/tests/data/valhap/hrid_of_chromosome.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_hrid_of_chromosome.hap rename to tests/data/valhap/hrid_of_chromosome.hap diff --git a/tests/data/hapfiles/valhap_with_inadequate_version.hap b/tests/data/valhap/inadequate_version.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inadequate_version.hap rename to tests/data/valhap/inadequate_version.hap diff --git a/tests/data/hapfiles/valhap_with_inadequate_version_columns.hap b/tests/data/valhap/inadequate_version_columns.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inadequate_version_columns.hap rename to tests/data/valhap/inadequate_version_columns.hap diff --git a/tests/data/hapfiles/valhap_with_inconvertible_ends.hap b/tests/data/valhap/inconvertible_ends.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inconvertible_ends.hap rename to tests/data/valhap/inconvertible_ends.hap diff --git a/tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap b/tests/data/valhap/inconvertible_ends_var.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inconvertible_ends_var.hap rename to tests/data/valhap/inconvertible_ends_var.hap diff --git a/tests/data/hapfiles/valhap_with_inconvertible_starts.hap 
b/tests/data/valhap/inconvertible_starts.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inconvertible_starts.hap rename to tests/data/valhap/inconvertible_starts.hap diff --git a/tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap b/tests/data/valhap/inconvertible_starts_var.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_inconvertible_starts_var.hap rename to tests/data/valhap/inconvertible_starts_var.hap diff --git a/tests/data/hapfiles/valhap_with_insufficient_columns.hap b/tests/data/valhap/insufficient_columns.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_insufficient_columns.hap rename to tests/data/valhap/insufficient_columns.hap diff --git a/tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap b/tests/data/valhap/insufficient_excols_in_reorder.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_insufficient_excols_in_reorder.hap rename to tests/data/valhap/insufficient_excols_in_reorder.hap diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap b/tests/data/valhap/invalid_column_addition_column_count.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_invalid_column_addition_column_count.hap rename to tests/data/valhap/invalid_column_addition_column_count.hap diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap b/tests/data/valhap/invalid_column_addition_data_types.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_invalid_column_addition_data_types.hap rename to tests/data/valhap/invalid_column_addition_data_types.hap diff --git a/tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap b/tests/data/valhap/invalid_column_addition_types.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_invalid_column_addition_types.hap rename to tests/data/valhap/invalid_column_addition_types.hap diff --git 
a/tests/data/hapfiles/valhap_is_directory.hap/keep.git b/tests/data/valhap/is_directory.hap/keep.git similarity index 100% rename from tests/data/hapfiles/valhap_is_directory.hap/keep.git rename to tests/data/valhap/is_directory.hap/keep.git diff --git a/tests/data/hapfiles/valhap_with_multiple_order_defs.hap b/tests/data/valhap/multiple_order_defs.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_multiple_order_defs.hap rename to tests/data/valhap/multiple_order_defs.hap diff --git a/tests/data/hapfiles/valhap_with_multiple_versions.hap b/tests/data/valhap/multiple_versions.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_multiple_versions.hap rename to tests/data/valhap/multiple_versions.hap diff --git a/tests/data/hapfiles/valhap_with_no_version.hap b/tests/data/valhap/no_version.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_no_version.hap rename to tests/data/valhap/no_version.hap diff --git a/tests/data/hapfiles/valhap_with_out_of_header_metas.hap b/tests/data/valhap/out_of_header_metas.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_out_of_header_metas.hap rename to tests/data/valhap/out_of_header_metas.hap diff --git a/tests/data/hapfiles/simple.hap b/tests/data/valhap/simple.hap similarity index 100% rename from tests/data/hapfiles/simple.hap rename to tests/data/valhap/simple.hap diff --git a/tests/data/hapfiles/simple.pvar b/tests/data/valhap/simple.pvar similarity index 100% rename from tests/data/hapfiles/simple.pvar rename to tests/data/valhap/simple.pvar diff --git a/tests/data/hapfiles/valhap_with_start_after_end.hap b/tests/data/valhap/start_after_end.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_start_after_end.hap rename to tests/data/valhap/start_after_end.hap diff --git a/tests/data/hapfiles/valhap_with_unassociated_haplotype.hap b/tests/data/valhap/unassociated_haplotype.hap similarity index 100% rename from 
tests/data/hapfiles/valhap_with_unassociated_haplotype.hap rename to tests/data/valhap/unassociated_haplotype.hap diff --git a/tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap b/tests/data/valhap/unexistent_col_in_order.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_unexistent_col_in_order.hap rename to tests/data/valhap/unexistent_col_in_order.hap diff --git a/tests/data/hapfiles/valhap_with_unexistent_fields.hap b/tests/data/valhap/unexistent_fields.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_unexistent_fields.hap rename to tests/data/valhap/unexistent_fields.hap diff --git a/tests/data/hapfiles/valhap_with_unexistent_reorders.hap b/tests/data/valhap/unexistent_reorders.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_unexistent_reorders.hap rename to tests/data/valhap/unexistent_reorders.hap diff --git a/tests/data/hapfiles/valhap_with_unrecognizable_allele.hap b/tests/data/valhap/unrecognizable_allele.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_unrecognizable_allele.hap rename to tests/data/valhap/unrecognizable_allele.hap diff --git a/tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap b/tests/data/valhap/variant_id_of_chromosome.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_variant_id_of_chromosome.hap rename to tests/data/valhap/variant_id_of_chromosome.hap diff --git a/tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap b/tests/data/valhap/variant_inexistent_haplotype_id.hap similarity index 100% rename from tests/data/hapfiles/valhap_with_variant_inexistent_haplotype_id.hap rename to tests/data/valhap/variant_inexistent_haplotype_id.hap diff --git a/tests/test_validate.py b/tests/test_validate.py index 598a9ec0..ab4b9a7d 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -5,7 +5,7 @@ from . 
import test_data from haptools import validate as val_hapfile -DATADIR = Path(__file__).parent.joinpath("data") / "hapfiles" +DATADIR = Path(__file__).parent.joinpath("data") / "valhap" def test_generated_haplotypes(): @@ -17,176 +17,142 @@ def test_generated_haplotypes(): def test_with_empty_lines(): - assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_empty_lines.hap") + assert val_hapfile.is_hapfile_valid(DATADIR / "empty_lines.hap") def test_with_out_of_header_metas_sorted(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_out_of_header_metas.hap", sorted=True + DATADIR / "out_of_header_metas.hap", sorted=True ) def test_with_out_of_header_metas_unsorted(): assert val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_out_of_header_metas.hap", sorted=False + DATADIR / "out_of_header_metas.hap", sorted=False ) def test_with_10_extras_reordered(): - assert val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_10_extras_reordered.hap") + assert val_hapfile.is_hapfile_valid(DATADIR / "10_extras_reordered.hap") def test_with_unexistent_reorders(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_reorders.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_reorders.hap") def test_with_unexistent_fields(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_fields.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_fields.hap") def test_with_inadequate_version(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inadequate_version.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "inadequate_version.hap") def test_with_no_version(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_no_version.hap") + assert not val_hapfile.is_hapfile_valid(DATADIR / "no_version.hap") def test_with_multiple_versions(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_multiple_versions.hap" - ) + assert 
not val_hapfile.is_hapfile_valid(DATADIR / "multiple_versions.hap") def test_with_inadequate_version_columns(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inadequate_version_columns.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "inadequate_version_columns.hap") def test_with_invalid_column_addition_column_count(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_column_count.hap" + DATADIR / "invalid_column_addition_column_count.hap" ) def test_with_invalid_column_addition_types(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_types.hap" + DATADIR / "invalid_column_addition_types.hap" ) def test_with_invalid_column_addition_data_types(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_invalid_column_addition_data_types.hap" + DATADIR / "invalid_column_addition_data_types.hap" ) def test_with_insufficient_columns(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_insufficient_columns.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "insufficient_columns.hap") def test_with_inconvertible_starts(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inconvertible_starts.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_starts.hap") def test_with_inconvertible_ends(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inconvertible_ends.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_ends.hap") def test_with_inconvertible_starts_var(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inconvertible_starts_var.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_starts_var.hap") def test_with_inconvertible_ends_var(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_inconvertible_ends_var.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / 
"inconvertible_ends_var.hap") -def test_valhap_with_start_after_end(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_start_after_end.hap") +def test_start_after_end(): + assert not val_hapfile.is_hapfile_valid(DATADIR / "start_after_end.hap") def test_is_directory(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_is_directory.hap") + assert not val_hapfile.is_hapfile_valid(DATADIR / "is_directory.hap") def test_with_variant_id_of_chromosome(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_variant_id_of_chromosome.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "variant_id_of_chromosome.hap") def test_with_hrid_of_chromosome(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_hrid_of_chromosome.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "hrid_of_chromosome.hap") def test_with_unexistent_col_in_order(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unexistent_col_in_order.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_col_in_order.hap") def test_with_unassociated_haplotype(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unassociated_haplotype.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "unassociated_haplotype.hap") def test_with_unrecognizable_allele(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_unrecognizable_allele.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "unrecognizable_allele.hap") def test_with_duplicate_ids(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "valhap_with_duplicate_ids.hap") + assert not val_hapfile.is_hapfile_valid(DATADIR / "duplicate_ids.hap") def test_with_duplicate_vids_per_haplotype(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_duplicate_vids_per_haplotype.hap" + DATADIR / "duplicate_vids_per_haplotype.hap" ) def test_with_excol_of_wrong_type(): - assert not 
val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_excol_of_wrong_type.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "excol_of_wrong_type.hap") def test_with_multiple_order_defs(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_multiple_order_defs.hap" - ) + assert not val_hapfile.is_hapfile_valid(DATADIR / "multiple_order_defs.hap") def test_with_insufficient_excols_in_reorder(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_insufficient_excols_in_reorder.hap" + DATADIR / "insufficient_excols_in_reorder.hap" ) def test_with_variant_inexistent_haplotype_id(): assert not val_hapfile.is_hapfile_valid( - DATADIR / "valhap_with_variant_inexistent_haplotype_id.hap" + DATADIR / "variant_inexistent_haplotype_id.hap" ) From 4b9583472de66c458d753ea51e58a1c3b61e3312 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 17 Sep 2023 11:02:33 -0700 Subject: [PATCH 28/44] remove test code import prefix --- tests/test_validate.py | 87 +++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 52 deletions(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index ab4b9a7d..fe27c756 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -2,8 +2,7 @@ import pytest -from . 
import test_data -from haptools import validate as val_hapfile +from haptools.validate import is_hapfile_valid DATADIR = Path(__file__).parent.joinpath("data") / "valhap" @@ -13,155 +12,139 @@ def test_generated_haplotypes(): hapfile = Path(datadir / "simple.hap") pvarfile = Path(datadir / "simple.pvar") - assert val_hapfile.is_hapfile_valid(hapfile, pvar=pvarfile) + assert is_hapfile_valid(hapfile, pvar=pvarfile) def test_with_empty_lines(): - assert val_hapfile.is_hapfile_valid(DATADIR / "empty_lines.hap") + assert is_hapfile_valid(DATADIR / "empty_lines.hap") def test_with_out_of_header_metas_sorted(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "out_of_header_metas.hap", sorted=True - ) + assert not is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=True) def test_with_out_of_header_metas_unsorted(): - assert val_hapfile.is_hapfile_valid( - DATADIR / "out_of_header_metas.hap", sorted=False - ) + assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=False) def test_with_10_extras_reordered(): - assert val_hapfile.is_hapfile_valid(DATADIR / "10_extras_reordered.hap") + assert is_hapfile_valid(DATADIR / "10_extras_reordered.hap") def test_with_unexistent_reorders(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_reorders.hap") + assert not is_hapfile_valid(DATADIR / "unexistent_reorders.hap") def test_with_unexistent_fields(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_fields.hap") + assert not is_hapfile_valid(DATADIR / "unexistent_fields.hap") def test_with_inadequate_version(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inadequate_version.hap") + assert not is_hapfile_valid(DATADIR / "inadequate_version.hap") def test_with_no_version(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "no_version.hap") + assert not is_hapfile_valid(DATADIR / "no_version.hap") def test_with_multiple_versions(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "multiple_versions.hap") + 
assert not is_hapfile_valid(DATADIR / "multiple_versions.hap") def test_with_inadequate_version_columns(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inadequate_version_columns.hap") + assert not is_hapfile_valid(DATADIR / "inadequate_version_columns.hap") def test_with_invalid_column_addition_column_count(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "invalid_column_addition_column_count.hap" - ) + assert not is_hapfile_valid(DATADIR / "invalid_column_addition_column_count.hap") def test_with_invalid_column_addition_types(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "invalid_column_addition_types.hap" - ) + assert not is_hapfile_valid(DATADIR / "invalid_column_addition_types.hap") def test_with_invalid_column_addition_data_types(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "invalid_column_addition_data_types.hap" - ) + assert not is_hapfile_valid(DATADIR / "invalid_column_addition_data_types.hap") def test_with_insufficient_columns(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "insufficient_columns.hap") + assert not is_hapfile_valid(DATADIR / "insufficient_columns.hap") def test_with_inconvertible_starts(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_starts.hap") + assert not is_hapfile_valid(DATADIR / "inconvertible_starts.hap") def test_with_inconvertible_ends(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_ends.hap") + assert not is_hapfile_valid(DATADIR / "inconvertible_ends.hap") def test_with_inconvertible_starts_var(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_starts_var.hap") + assert not is_hapfile_valid(DATADIR / "inconvertible_starts_var.hap") def test_with_inconvertible_ends_var(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "inconvertible_ends_var.hap") + assert not is_hapfile_valid(DATADIR / "inconvertible_ends_var.hap") def test_start_after_end(): - assert not val_hapfile.is_hapfile_valid(DATADIR / 
"start_after_end.hap") + assert not is_hapfile_valid(DATADIR / "start_after_end.hap") def test_is_directory(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "is_directory.hap") + assert not is_hapfile_valid(DATADIR / "is_directory.hap") def test_with_variant_id_of_chromosome(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "variant_id_of_chromosome.hap") + assert not is_hapfile_valid(DATADIR / "variant_id_of_chromosome.hap") def test_with_hrid_of_chromosome(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "hrid_of_chromosome.hap") + assert not is_hapfile_valid(DATADIR / "hrid_of_chromosome.hap") def test_with_unexistent_col_in_order(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "unexistent_col_in_order.hap") + assert not is_hapfile_valid(DATADIR / "unexistent_col_in_order.hap") def test_with_unassociated_haplotype(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "unassociated_haplotype.hap") + assert not is_hapfile_valid(DATADIR / "unassociated_haplotype.hap") def test_with_unrecognizable_allele(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "unrecognizable_allele.hap") + assert not is_hapfile_valid(DATADIR / "unrecognizable_allele.hap") def test_with_duplicate_ids(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "duplicate_ids.hap") + assert not is_hapfile_valid(DATADIR / "duplicate_ids.hap") def test_with_duplicate_vids_per_haplotype(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "duplicate_vids_per_haplotype.hap" - ) + assert not is_hapfile_valid(DATADIR / "duplicate_vids_per_haplotype.hap") def test_with_excol_of_wrong_type(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "excol_of_wrong_type.hap") + assert not is_hapfile_valid(DATADIR / "excol_of_wrong_type.hap") def test_with_multiple_order_defs(): - assert not val_hapfile.is_hapfile_valid(DATADIR / "multiple_order_defs.hap") + assert not is_hapfile_valid(DATADIR / "multiple_order_defs.hap") def test_with_insufficient_excols_in_reorder(): - 
assert not val_hapfile.is_hapfile_valid( - DATADIR / "insufficient_excols_in_reorder.hap" - ) + assert not is_hapfile_valid(DATADIR / "insufficient_excols_in_reorder.hap") def test_with_variant_inexistent_haplotype_id(): - assert not val_hapfile.is_hapfile_valid( - DATADIR / "variant_inexistent_haplotype_id.hap" - ) + assert not is_hapfile_valid(DATADIR / "variant_inexistent_haplotype_id.hap") def test_with_missing_variant_in_pvar(): pgenlib = pytest.importorskip("pgenlib") - assert not val_hapfile.is_hapfile_valid( + assert not is_hapfile_valid( DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" ) def test_unreadable_hapfile(): - assert not val_hapfile.is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) + assert not is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) From 5614004fde911b7f0a5d31f04f6115ce0efaded0 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 17 Sep 2023 11:35:24 -0700 Subject: [PATCH 29/44] add tests for command line and add non zero exit code --- docs/commands/validate.rst | 10 +++---- haptools/__main__.py | 4 +-- haptools/validate.py | 2 +- tests/test_validate.py | 59 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index 99b520ea..02cfe7ec 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -30,7 +30,7 @@ Outputs a message specifying the amount of errors and warnings. .. code-block:: - [ INFO] Completed HapFile validation with 0 errors and 0 warnings. + [ INFO] Completed .hap file validation with 0 errors and 0 warnings. All warnings and errors will be logged if there are any. @@ -41,8 +41,8 @@ All warnings and errors will be logged if there are any. .. code-block:: [ WARNING] No version declaration found. Assuming to use the latest version. - [ INFO] Completed HapFile validation with 0 errors and 1 warnings. 
- [ WARNING] Found several warnings and / or errors in the hapfile + [ INFO] Completed .hap file validation with 0 errors and 1 warnings. + Error: Found several warnings and / or errors in the .hap file One can use ``--no-sort`` to avoid sorting the file. This will make it so that all unordered files will get removed, such as out-of-header lines with meta information. @@ -104,11 +104,11 @@ If the previous example were to be sorted then there would be several errors in All sorted files parse the meta information lines first, thus the ``V`` lines would be incomplete. As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pvar`` file with which to compare the existence of variant IDs. -The following will check if all of the variant IDs in the ``.hap`` appear in the ``.pvar`` file. +The following will check if all of the variant IDs in the ``.hap`` file appear in the ``.pvar`` file. .. code-block:: bash - haptools validate --genotypes tests/data/valhap/test_data.pvar tests/data/valhap/test_data.hap + haptools validate --genotypes tests/data/simple.pvar tests/data/simple.hap .. 
note:: diff --git a/haptools/__main__.py b/haptools/__main__.py index 5f4c160a..22d1527c 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1062,10 +1062,10 @@ def validate( log = getLogger(name="validate", level=verbosity) - is_valid = is_hapfile_valid(filename, sorted=sort, log=log, pgen=genotypes) + is_valid = is_hapfile_valid(filename, sorted=sort, log=log, pvar=genotypes) if not is_valid: - log.warn("Found several warnings and / or errors in the hapfile") + raise click.ClickException("Found several warnings and / or errors in the .hap file") if __name__ == "__main__": diff --git a/haptools/validate.py b/haptools/validate.py index d682e18c..8896e538 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -868,7 +868,7 @@ def is_hapfile_valid( hapfile.compare_haps_to_pvar(ids) log.info( - f"Completed HapFile validation with {hapfile.errc} errors and" + f"Completed .hap file validation with {hapfile.errc} errors and" f" {hapfile.warc} warnings." ) diff --git a/tests/test_validate.py b/tests/test_validate.py index fe27c756..850e82f6 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,16 +1,18 @@ from pathlib import Path import pytest +from click.testing import CliRunner +from haptools.__main__ import main from haptools.validate import is_hapfile_valid +PARENT_DATADIR = Path(__file__).parent.joinpath("data") DATADIR = Path(__file__).parent.joinpath("data") / "valhap" def test_generated_haplotypes(): - datadir = Path(__file__).parent.joinpath("data") - hapfile = Path(datadir / "simple.hap") - pvarfile = Path(datadir / "simple.pvar") + hapfile = Path(PARENT_DATADIR / "simple.hap") + pvarfile = Path(PARENT_DATADIR / "simple.pvar") assert is_hapfile_valid(hapfile, pvar=pvarfile) @@ -148,3 +150,54 @@ def test_with_missing_variant_in_pvar(): def test_unreadable_hapfile(): assert not is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) + + +def test_basic(capfd): + hp_file = DATADIR / "basic.hap" + + cmd = f"validate {hp_file}" + 
runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code == 0 + + +def test_no_version(capfd): + hp_file = DATADIR / "no_version.hap" + + cmd = f"validate {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code != 0 + + +def test_no_version(capfd): + hp_file = DATADIR / "no_version.hap" + + cmd = f"validate {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code != 0 + + +def test_sorted(capfd): + hp_file = DATADIR / "out_of_header_metas.hap" + + cmd = f"validate --sorted {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code != 0 + + cmd = f"validate --no-sorted {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code == 0 + + +def test_with_pvar(capfd): + gt_file = PARENT_DATADIR / "simple.pvar" + hp_file = PARENT_DATADIR / "simple.hap" + + cmd = f"validate --genotypes {gt_file} {hp_file}" + runner = CliRunner() + result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) + assert result.exit_code == 0 From 474f9fc3b483031bc3b1d8bc54bedbcabe25188f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 17 Sep 2023 16:26:01 -0700 Subject: [PATCH 30/44] clarify how sorting works --- docs/commands/validate.rst | 72 +++++--------------------------------- haptools/__main__.py | 10 +++--- tests/test_validate.py | 4 +-- 3 files changed, 16 insertions(+), 70 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index 02cfe7ec..8d6c36e5 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -4,11 +4,9 @@ validate ======== -Validate the structure of a ``.hap`` file. +Validate the formatting of a sorted ``.hap`` file. 
Output warnings/errors for a ``.hap`` that is improperly formatted. -When a ``.hap`` file contains any errors, they will be logged accordingly. - -If provided, the SNPs and TRs present in the ``.hap`` file will be confirmed to exist in a ``.pvar`` file. +If a ``.pvar`` file is provided, the SNPs and TRs present in the ``.hap`` file will be checked for existence in the ``.pvar`` file. Usage ~~~~~ @@ -44,64 +42,12 @@ All warnings and errors will be logged if there are any. [ INFO] Completed .hap file validation with 0 errors and 1 warnings. Error: Found several warnings and / or errors in the .hap file -One can use ``--no-sort`` to avoid sorting the file. -This will make it so that all unordered files will get removed, such as out-of-header lines with meta information. +All ``.hap`` files must be sorted before they can be validated. By default, we try our best to sort your ``.hap`` file internally before performing any validation checks, but the sorting can fail in certain cases. +If your ``.hap`` file is already sorted, you should use the ``--sorted`` parameter. It will speed things up a bit by skipping the sorting step. If your ``.hap`` file is indexed, it will be assumed to be sorted. .. code-block:: bash - haptools validate --no-sort tests/data/valhap/out_of_header_metas.hap - -Will turn: - -.. 
code-block:: - - # orderH ancestry beta - # version 0.2.0 - #H ancestry s Local ancestry - #H beta .2f Effect size in linear model - #R beta .2f Effect size in linear model - H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 - R 21 26938353 26938400 21_26938353_STR 0.45 - H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 - H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 - # This should cause an error if the file is sorted - #V test_field s A field to test with - V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C - V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C - V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G - V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A - V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T - V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A - V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A - -Into - -.. code-block:: - - # orderH ancestry beta - # version 0.2.0 - #H ancestry s Local ancestry - #H beta .2f Effect size in linear model - #R beta .2f Effect size in linear model - H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 - R 21 26938353 26938400 21_26938353_STR 0.45 - H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 - H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 - V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C - V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C - V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G - V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A - V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T - V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A - V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T - V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A - - -If the previous example were to be sorted then there would be several errors in the ``.hap`` file. 
-All sorted files parse the meta information lines first, thus the ``V`` lines would be incomplete. + haptools validate --sorted tests/data/valhap/out_of_header_metas.hap As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pvar`` file with which to compare the existence of variant IDs. The following will check if all of the variant IDs in the ``.hap`` file appear in the ``.pvar`` file. @@ -112,10 +58,10 @@ The following will check if all of the variant IDs in the ``.hap`` file appear i .. note:: - We accept a PVAR file instead of a VCF in order to avoid reading lots of - information which is not relevant to the validation process. However, any - VCF wihtout a FORMAT field is a valid PVAR file. So you can easily create a PVAR file - using the ``cut`` command or ``plink2 --make-just-pvar``. + We accept a PVAR file instead of a VCF in order to avoid reading lots of information + which is not relevant to the validation process. However, any VCF subsetted to just + its first 8 fields is a valid PVAR file. So you can easily create a PVAR file from a + VCF using ``cut -f -8`` or ``plink2 --make-just-pvar``. 
Detailed Usage ~~~~~~~~~~~~~~ diff --git a/haptools/__main__.py b/haptools/__main__.py index 22d1527c..97e840e7 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1028,11 +1028,11 @@ def clump( @main.command(short_help="Validate the structure of a .hap file") @click.argument("filename", type=click.Path(exists=True, path_type=Path)) @click.option( - "--sort/--no-sort", + "--sorted/--not-sorted", is_flag=True, - default=True, + default=False, show_default=True, - help="Sorting of the file will not be performed", + help="Has the file been sorted already?", ) @click.option( "--genotypes", @@ -1053,7 +1053,7 @@ def clump( ) def validate( filename: Path, - sort: bool, + sorted: bool = False, genotypes: Path | None = None, verbosity: str = "INFO", ): @@ -1062,7 +1062,7 @@ def validate( log = getLogger(name="validate", level=verbosity) - is_valid = is_hapfile_valid(filename, sorted=sort, log=log, pvar=genotypes) + is_valid = is_hapfile_valid(filename, sorted=(not sorted), log=log, pvar=genotypes) if not is_valid: raise click.ClickException("Found several warnings and / or errors in the .hap file") diff --git a/tests/test_validate.py b/tests/test_validate.py index 850e82f6..b46a40d8 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -182,12 +182,12 @@ def test_no_version(capfd): def test_sorted(capfd): hp_file = DATADIR / "out_of_header_metas.hap" - cmd = f"validate --sorted {hp_file}" + cmd = f"validate --not-sorted {hp_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) assert result.exit_code != 0 - cmd = f"validate --no-sorted {hp_file}" + cmd = f"validate --sorted {hp_file}" runner = CliRunner() result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) assert result.exit_code == 0 From 6b7942cb89a96cb0cb58e5484a2d854518c5f01f Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:12:36 -0700 Subject: [PATCH 31/44] change 
behavior of sorting parameter --- docs/commands/validate.rst | 12 ++++++++---- docs/formats/genotypes.rst | 2 +- haptools/__main__.py | 6 +++++- haptools/validate.py | 12 ++++++------ tests/test_validate.py | 9 ++------- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index 8d6c36e5..136dabe1 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -4,9 +4,13 @@ validate ======== -Validate the formatting of a sorted ``.hap`` file. Output warnings/errors for a ``.hap`` that is improperly formatted. +Validate the formatting of a sorted :doc:`.hap file `. Output warnings/errors explaining how the formatting of your ``.hap`` file may be improved. -If a ``.pvar`` file is provided, the SNPs and TRs present in the ``.hap`` file will be checked for existence in the ``.pvar`` file. +If a :ref:`.pvar file ` file is provided, the SNPs and TRs present in the ``.hap`` file will be checked for existence in the ``.pvar`` file. + +.. note:: + + This command will not check that your ``.hap`` file is properly sorted. It only checks formatting. Usage ~~~~~ @@ -42,12 +46,12 @@ All warnings and errors will be logged if there are any. [ INFO] Completed .hap file validation with 0 errors and 1 warnings. Error: Found several warnings and / or errors in the .hap file -All ``.hap`` files must be sorted before they can be validated. By default, we try our best to sort your ``.hap`` file internally before performing any validation checks, but the sorting can fail in certain cases. +All ``.hap`` files must be sorted before they can be validated. By default, we try our best to sort your ``.hap`` file internally before performing any validation checks. If your ``.hap`` file is already sorted, you should use the ``--sorted`` parameter. It will speed things up a bit by skipping the sorting step. If your ``.hap`` file is indexed, it will be assumed to be sorted. .. 
code-block:: bash - haptools validate --sorted tests/data/valhap/out_of_header_metas.hap + haptools validate --sorted tests/data/simple.hap As mentioned before, one can use the ``--genotypes`` flag to provide a ``.pvar`` file with which to compare the existence of variant IDs. The following will check if all of the variant IDs in the ``.hap`` file appear in the ``.pvar`` file. diff --git a/docs/formats/genotypes.rst b/docs/formats/genotypes.rst index 5828926e..3a2a6261 100644 --- a/docs/formats/genotypes.rst +++ b/docs/formats/genotypes.rst @@ -18,7 +18,7 @@ Genotype files must be specified as VCF or BCF files. They can be bgzip-compress PLINK2 PGEN ~~~~~~~~~~~ -There is also experimental support for `PLINK2 PGEN `_ files in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. +There is also experimental support for `PLINK2 PGEN `_ files (accomponied by PVAR and PSAM files) in some commands. These files can be loaded and created much more quickly than VCFs, so we highly recommend using them if you're working with large datasets. See the documentation for the :class:`GenotypesPLINK` class in :ref:`the API docs ` for more information. If you run out memory when using PGEN files, consider reading/writing variants from the file in chunks via the ``--chunk-size`` parameter. 
diff --git a/haptools/__main__.py b/haptools/__main__.py index 97e840e7..108e40fb 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1062,7 +1062,11 @@ def validate( log = getLogger(name="validate", level=verbosity) - is_valid = is_hapfile_valid(filename, sorted=(not sorted), log=log, pvar=genotypes) + # if the hap file is compressed and a .tbi index exists for it, assume it is sorted + if filename.suffix == ".gz" and filename.with_suffix(".gz.tbi").exists(): + sorted = True + + is_valid = is_hapfile_valid(filename, sorted=sorted, log=log, pvar=genotypes) if not is_valid: raise click.ClickException("Found several warnings and / or errors in the .hap file") diff --git a/haptools/validate.py b/haptools/validate.py index 8896e538..9a31f9b3 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -37,7 +37,7 @@ def __init__(self, filename: Path, logger=None): self.filename = filename self.log = logger or logging.getLogger(self.__class__.__name__) - def lines(self, sorted: bool = True) -> list[Line]: + def lines(self, sorted: bool = False) -> list[Line]: buffer = open(self.filename) content = [ @@ -49,6 +49,7 @@ def lines(self, sorted: bool = True) -> list[Line]: buffer.close() if not sorted: + self.log.debug("Assuming .hap file is unsorted. 
Attempting to sort.") meta_limit = next( idx for idx, line in enumerate(content) if not line[0].startswith("#") ) @@ -57,9 +58,8 @@ def lines(self, sorted: bool = True) -> list[Line]: for idx, line in enumerate(content) if (not line[0].startswith("#")) or idx < meta_limit ] - - # lol - content.sort(key=lambda line: ord(line[0][0])) + content.sort(key=lambda line: ord(line[0][0])) + self.log.debug("Finished sorting .hap file") return content @@ -156,7 +156,7 @@ def __init__(self, logger=None): self.errc: int = 0 self.warc: int = 0 - def extract_and_store_content(self, file: HapFileIO, sorted: bool = True): + def extract_and_store_content(self, file: HapFileIO, sorted: bool = False): lines = file.lines(sorted=sorted) self.extract_meta_lines(lines) @@ -829,7 +829,7 @@ def warnskip(self, line: Line): def is_hapfile_valid( filename: Path, - sorted: bool = True, + sorted: bool = False, pvar: Path | None = None, max_variants: int = 10000, log: logging.Logger = None, diff --git a/tests/test_validate.py b/tests/test_validate.py index b46a40d8..1a997a4e 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -22,7 +22,7 @@ def test_with_empty_lines(): def test_with_out_of_header_metas_sorted(): - assert not is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=True) + assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=True) def test_with_out_of_header_metas_unsorted(): @@ -180,12 +180,7 @@ def test_no_version(capfd): def test_sorted(capfd): - hp_file = DATADIR / "out_of_header_metas.hap" - - cmd = f"validate --not-sorted {hp_file}" - runner = CliRunner() - result = runner.invoke(main, cmd.split(" "), catch_exceptions=False) - assert result.exit_code != 0 + hp_file = PARENT_DATADIR / "simple.hap" cmd = f"validate --sorted {hp_file}" runner = CliRunner() From d16f7bd5107a1bc7c7743b4ae82c1acf7b86ddb1 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 30 Sep 2023 20:50:45 -0700 Subject: [PATCH 
32/44] do not skip pytest for pgenlib --- tests/test_validate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_validate.py b/tests/test_validate.py index 1a997a4e..15f1574c 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -142,7 +142,6 @@ def test_with_variant_inexistent_haplotype_id(): def test_with_missing_variant_in_pvar(): - pgenlib = pytest.importorskip("pgenlib") assert not is_hapfile_valid( DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" ) From fc71adf68d4d55fbfc65fa66f4fdf188e7603779 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Mon, 2 Oct 2023 09:42:16 -0700 Subject: [PATCH 33/44] refmt with black --- haptools/__main__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haptools/__main__.py b/haptools/__main__.py index 108e40fb..5e5675b2 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1069,7 +1069,9 @@ def validate( is_valid = is_hapfile_valid(filename, sorted=sorted, log=log, pvar=genotypes) if not is_valid: - raise click.ClickException("Found several warnings and / or errors in the .hap file") + raise click.ClickException( + "Found several warnings and / or errors in the .hap file" + ) if __name__ == "__main__": From 6065862dc32b367026808d033336538c2f427bd4 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Oct 2023 19:18:31 -0700 Subject: [PATCH 34/44] remove extra files outside of test dir --- .../valhap/is_directory.hap/{keep.git => .gitkeep} | 0 tests/test.hap | 10 ---------- tests/test.pvar | 8 -------- tests/test_validate.py | 3 +-- 4 files changed, 1 insertion(+), 20 deletions(-) rename tests/data/valhap/is_directory.hap/{keep.git => .gitkeep} (100%) delete mode 100644 tests/test.hap delete mode 100644 tests/test.pvar diff --git a/tests/data/valhap/is_directory.hap/keep.git b/tests/data/valhap/is_directory.hap/.gitkeep similarity index 100% rename from 
tests/data/valhap/is_directory.hap/keep.git rename to tests/data/valhap/is_directory.hap/.gitkeep diff --git a/tests/test.hap b/tests/test.hap deleted file mode 100644 index 1b06286a..00000000 --- a/tests/test.hap +++ /dev/null @@ -1,10 +0,0 @@ -# version 0.2.0 -H 1 10114 8 H1 -H 1 10114 10119 H2 -H 1 10116 10119 H3 -V H1 10114 10115 1:10114:T:C T -V H1 10116 10117 1:10116:A:G G -V H2 10114 10115 1:10114:T:C C -V H2 10117 10118 1:10117:C:A C -V H3 10116 10117 1:10116:A:G A -V H3 10117 10118 1:10117:C:A A diff --git a/tests/test.pvar b/tests/test.pvar deleted file mode 100644 index ca962b0b..00000000 --- a/tests/test.pvar +++ /dev/null @@ -1,8 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##contig= -#CHROM POS ID REF ALT QUAL FILTER INFO -1 10114 1:10114:T:C T C . . . -1 10116 1:10116:A:G A G . . . -1 10117 1:10117:C:A C A . . . -1 10122 1:10122:A:G A G . . . diff --git a/tests/test_validate.py b/tests/test_validate.py index 15f1574c..5eb79712 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,6 +1,5 @@ from pathlib import Path -import pytest from click.testing import CliRunner from haptools.__main__ import main @@ -143,7 +142,7 @@ def test_with_variant_inexistent_haplotype_id(): def test_with_missing_variant_in_pvar(): assert not is_hapfile_valid( - DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar" + DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar", ) From 50d5cb3ac5e464b7d568fc7bd6195bef196123d6 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Oct 2023 19:20:39 -0700 Subject: [PATCH 35/44] rename valhap test dir to validate --- tests/data/{valhap => validate}/10_extras_reordered.hap | 0 tests/data/{valhap => validate}/basic.hap | 0 tests/data/{valhap => validate}/basic.pvar | 0 tests/data/{valhap => validate}/basic_missing_ids.pvar | 0 tests/data/{valhap => validate}/correct.hap | 0 tests/data/{valhap => validate}/duplicate_ids.hap | 0 .../data/{valhap => 
validate}/duplicate_vids_per_haplotype.hap | 0 tests/data/{valhap => validate}/empty_lines.hap | 0 tests/data/{valhap => validate}/excol_of_wrong_type.hap | 0 tests/data/{valhap => validate}/hrid_of_chromosome.hap | 0 tests/data/{valhap => validate}/inadequate_version.hap | 0 tests/data/{valhap => validate}/inadequate_version_columns.hap | 0 tests/data/{valhap => validate}/inconvertible_ends.hap | 0 tests/data/{valhap => validate}/inconvertible_ends_var.hap | 0 tests/data/{valhap => validate}/inconvertible_starts.hap | 0 tests/data/{valhap => validate}/inconvertible_starts_var.hap | 0 tests/data/{valhap => validate}/insufficient_columns.hap | 0 .../{valhap => validate}/insufficient_excols_in_reorder.hap | 0 .../invalid_column_addition_column_count.hap | 0 .../{valhap => validate}/invalid_column_addition_data_types.hap | 0 .../data/{valhap => validate}/invalid_column_addition_types.hap | 0 tests/data/{valhap => validate}/is_directory.hap/.gitkeep | 0 tests/data/{valhap => validate}/multiple_order_defs.hap | 0 tests/data/{valhap => validate}/multiple_versions.hap | 0 tests/data/{valhap => validate}/no_version.hap | 0 tests/data/{valhap => validate}/out_of_header_metas.hap | 0 tests/data/{valhap => validate}/simple.hap | 0 tests/data/{valhap => validate}/simple.pvar | 0 tests/data/{valhap => validate}/start_after_end.hap | 0 tests/data/{valhap => validate}/unassociated_haplotype.hap | 0 tests/data/{valhap => validate}/unexistent_col_in_order.hap | 0 tests/data/{valhap => validate}/unexistent_fields.hap | 0 tests/data/{valhap => validate}/unexistent_reorders.hap | 0 tests/data/{valhap => validate}/unrecognizable_allele.hap | 0 tests/data/{valhap => validate}/variant_id_of_chromosome.hap | 0 .../{valhap => validate}/variant_inexistent_haplotype_id.hap | 0 tests/test_validate.py | 2 +- 37 files changed, 1 insertion(+), 1 deletion(-) rename tests/data/{valhap => validate}/10_extras_reordered.hap (100%) rename tests/data/{valhap => validate}/basic.hap (100%) rename 
tests/data/{valhap => validate}/basic.pvar (100%) rename tests/data/{valhap => validate}/basic_missing_ids.pvar (100%) rename tests/data/{valhap => validate}/correct.hap (100%) rename tests/data/{valhap => validate}/duplicate_ids.hap (100%) rename tests/data/{valhap => validate}/duplicate_vids_per_haplotype.hap (100%) rename tests/data/{valhap => validate}/empty_lines.hap (100%) rename tests/data/{valhap => validate}/excol_of_wrong_type.hap (100%) rename tests/data/{valhap => validate}/hrid_of_chromosome.hap (100%) rename tests/data/{valhap => validate}/inadequate_version.hap (100%) rename tests/data/{valhap => validate}/inadequate_version_columns.hap (100%) rename tests/data/{valhap => validate}/inconvertible_ends.hap (100%) rename tests/data/{valhap => validate}/inconvertible_ends_var.hap (100%) rename tests/data/{valhap => validate}/inconvertible_starts.hap (100%) rename tests/data/{valhap => validate}/inconvertible_starts_var.hap (100%) rename tests/data/{valhap => validate}/insufficient_columns.hap (100%) rename tests/data/{valhap => validate}/insufficient_excols_in_reorder.hap (100%) rename tests/data/{valhap => validate}/invalid_column_addition_column_count.hap (100%) rename tests/data/{valhap => validate}/invalid_column_addition_data_types.hap (100%) rename tests/data/{valhap => validate}/invalid_column_addition_types.hap (100%) rename tests/data/{valhap => validate}/is_directory.hap/.gitkeep (100%) rename tests/data/{valhap => validate}/multiple_order_defs.hap (100%) rename tests/data/{valhap => validate}/multiple_versions.hap (100%) rename tests/data/{valhap => validate}/no_version.hap (100%) rename tests/data/{valhap => validate}/out_of_header_metas.hap (100%) rename tests/data/{valhap => validate}/simple.hap (100%) rename tests/data/{valhap => validate}/simple.pvar (100%) rename tests/data/{valhap => validate}/start_after_end.hap (100%) rename tests/data/{valhap => validate}/unassociated_haplotype.hap (100%) rename tests/data/{valhap => 
validate}/unexistent_col_in_order.hap (100%) rename tests/data/{valhap => validate}/unexistent_fields.hap (100%) rename tests/data/{valhap => validate}/unexistent_reorders.hap (100%) rename tests/data/{valhap => validate}/unrecognizable_allele.hap (100%) rename tests/data/{valhap => validate}/variant_id_of_chromosome.hap (100%) rename tests/data/{valhap => validate}/variant_inexistent_haplotype_id.hap (100%) diff --git a/tests/data/valhap/10_extras_reordered.hap b/tests/data/validate/10_extras_reordered.hap similarity index 100% rename from tests/data/valhap/10_extras_reordered.hap rename to tests/data/validate/10_extras_reordered.hap diff --git a/tests/data/valhap/basic.hap b/tests/data/validate/basic.hap similarity index 100% rename from tests/data/valhap/basic.hap rename to tests/data/validate/basic.hap diff --git a/tests/data/valhap/basic.pvar b/tests/data/validate/basic.pvar similarity index 100% rename from tests/data/valhap/basic.pvar rename to tests/data/validate/basic.pvar diff --git a/tests/data/valhap/basic_missing_ids.pvar b/tests/data/validate/basic_missing_ids.pvar similarity index 100% rename from tests/data/valhap/basic_missing_ids.pvar rename to tests/data/validate/basic_missing_ids.pvar diff --git a/tests/data/valhap/correct.hap b/tests/data/validate/correct.hap similarity index 100% rename from tests/data/valhap/correct.hap rename to tests/data/validate/correct.hap diff --git a/tests/data/valhap/duplicate_ids.hap b/tests/data/validate/duplicate_ids.hap similarity index 100% rename from tests/data/valhap/duplicate_ids.hap rename to tests/data/validate/duplicate_ids.hap diff --git a/tests/data/valhap/duplicate_vids_per_haplotype.hap b/tests/data/validate/duplicate_vids_per_haplotype.hap similarity index 100% rename from tests/data/valhap/duplicate_vids_per_haplotype.hap rename to tests/data/validate/duplicate_vids_per_haplotype.hap diff --git a/tests/data/valhap/empty_lines.hap b/tests/data/validate/empty_lines.hap similarity index 100% rename from 
tests/data/valhap/empty_lines.hap rename to tests/data/validate/empty_lines.hap diff --git a/tests/data/valhap/excol_of_wrong_type.hap b/tests/data/validate/excol_of_wrong_type.hap similarity index 100% rename from tests/data/valhap/excol_of_wrong_type.hap rename to tests/data/validate/excol_of_wrong_type.hap diff --git a/tests/data/valhap/hrid_of_chromosome.hap b/tests/data/validate/hrid_of_chromosome.hap similarity index 100% rename from tests/data/valhap/hrid_of_chromosome.hap rename to tests/data/validate/hrid_of_chromosome.hap diff --git a/tests/data/valhap/inadequate_version.hap b/tests/data/validate/inadequate_version.hap similarity index 100% rename from tests/data/valhap/inadequate_version.hap rename to tests/data/validate/inadequate_version.hap diff --git a/tests/data/valhap/inadequate_version_columns.hap b/tests/data/validate/inadequate_version_columns.hap similarity index 100% rename from tests/data/valhap/inadequate_version_columns.hap rename to tests/data/validate/inadequate_version_columns.hap diff --git a/tests/data/valhap/inconvertible_ends.hap b/tests/data/validate/inconvertible_ends.hap similarity index 100% rename from tests/data/valhap/inconvertible_ends.hap rename to tests/data/validate/inconvertible_ends.hap diff --git a/tests/data/valhap/inconvertible_ends_var.hap b/tests/data/validate/inconvertible_ends_var.hap similarity index 100% rename from tests/data/valhap/inconvertible_ends_var.hap rename to tests/data/validate/inconvertible_ends_var.hap diff --git a/tests/data/valhap/inconvertible_starts.hap b/tests/data/validate/inconvertible_starts.hap similarity index 100% rename from tests/data/valhap/inconvertible_starts.hap rename to tests/data/validate/inconvertible_starts.hap diff --git a/tests/data/valhap/inconvertible_starts_var.hap b/tests/data/validate/inconvertible_starts_var.hap similarity index 100% rename from tests/data/valhap/inconvertible_starts_var.hap rename to tests/data/validate/inconvertible_starts_var.hap diff --git 
a/tests/data/valhap/insufficient_columns.hap b/tests/data/validate/insufficient_columns.hap similarity index 100% rename from tests/data/valhap/insufficient_columns.hap rename to tests/data/validate/insufficient_columns.hap diff --git a/tests/data/valhap/insufficient_excols_in_reorder.hap b/tests/data/validate/insufficient_excols_in_reorder.hap similarity index 100% rename from tests/data/valhap/insufficient_excols_in_reorder.hap rename to tests/data/validate/insufficient_excols_in_reorder.hap diff --git a/tests/data/valhap/invalid_column_addition_column_count.hap b/tests/data/validate/invalid_column_addition_column_count.hap similarity index 100% rename from tests/data/valhap/invalid_column_addition_column_count.hap rename to tests/data/validate/invalid_column_addition_column_count.hap diff --git a/tests/data/valhap/invalid_column_addition_data_types.hap b/tests/data/validate/invalid_column_addition_data_types.hap similarity index 100% rename from tests/data/valhap/invalid_column_addition_data_types.hap rename to tests/data/validate/invalid_column_addition_data_types.hap diff --git a/tests/data/valhap/invalid_column_addition_types.hap b/tests/data/validate/invalid_column_addition_types.hap similarity index 100% rename from tests/data/valhap/invalid_column_addition_types.hap rename to tests/data/validate/invalid_column_addition_types.hap diff --git a/tests/data/valhap/is_directory.hap/.gitkeep b/tests/data/validate/is_directory.hap/.gitkeep similarity index 100% rename from tests/data/valhap/is_directory.hap/.gitkeep rename to tests/data/validate/is_directory.hap/.gitkeep diff --git a/tests/data/valhap/multiple_order_defs.hap b/tests/data/validate/multiple_order_defs.hap similarity index 100% rename from tests/data/valhap/multiple_order_defs.hap rename to tests/data/validate/multiple_order_defs.hap diff --git a/tests/data/valhap/multiple_versions.hap b/tests/data/validate/multiple_versions.hap similarity index 100% rename from 
tests/data/valhap/multiple_versions.hap rename to tests/data/validate/multiple_versions.hap diff --git a/tests/data/valhap/no_version.hap b/tests/data/validate/no_version.hap similarity index 100% rename from tests/data/valhap/no_version.hap rename to tests/data/validate/no_version.hap diff --git a/tests/data/valhap/out_of_header_metas.hap b/tests/data/validate/out_of_header_metas.hap similarity index 100% rename from tests/data/valhap/out_of_header_metas.hap rename to tests/data/validate/out_of_header_metas.hap diff --git a/tests/data/valhap/simple.hap b/tests/data/validate/simple.hap similarity index 100% rename from tests/data/valhap/simple.hap rename to tests/data/validate/simple.hap diff --git a/tests/data/valhap/simple.pvar b/tests/data/validate/simple.pvar similarity index 100% rename from tests/data/valhap/simple.pvar rename to tests/data/validate/simple.pvar diff --git a/tests/data/valhap/start_after_end.hap b/tests/data/validate/start_after_end.hap similarity index 100% rename from tests/data/valhap/start_after_end.hap rename to tests/data/validate/start_after_end.hap diff --git a/tests/data/valhap/unassociated_haplotype.hap b/tests/data/validate/unassociated_haplotype.hap similarity index 100% rename from tests/data/valhap/unassociated_haplotype.hap rename to tests/data/validate/unassociated_haplotype.hap diff --git a/tests/data/valhap/unexistent_col_in_order.hap b/tests/data/validate/unexistent_col_in_order.hap similarity index 100% rename from tests/data/valhap/unexistent_col_in_order.hap rename to tests/data/validate/unexistent_col_in_order.hap diff --git a/tests/data/valhap/unexistent_fields.hap b/tests/data/validate/unexistent_fields.hap similarity index 100% rename from tests/data/valhap/unexistent_fields.hap rename to tests/data/validate/unexistent_fields.hap diff --git a/tests/data/valhap/unexistent_reorders.hap b/tests/data/validate/unexistent_reorders.hap similarity index 100% rename from tests/data/valhap/unexistent_reorders.hap rename to 
tests/data/validate/unexistent_reorders.hap diff --git a/tests/data/valhap/unrecognizable_allele.hap b/tests/data/validate/unrecognizable_allele.hap similarity index 100% rename from tests/data/valhap/unrecognizable_allele.hap rename to tests/data/validate/unrecognizable_allele.hap diff --git a/tests/data/valhap/variant_id_of_chromosome.hap b/tests/data/validate/variant_id_of_chromosome.hap similarity index 100% rename from tests/data/valhap/variant_id_of_chromosome.hap rename to tests/data/validate/variant_id_of_chromosome.hap diff --git a/tests/data/valhap/variant_inexistent_haplotype_id.hap b/tests/data/validate/variant_inexistent_haplotype_id.hap similarity index 100% rename from tests/data/valhap/variant_inexistent_haplotype_id.hap rename to tests/data/validate/variant_inexistent_haplotype_id.hap diff --git a/tests/test_validate.py b/tests/test_validate.py index 5eb79712..39fc9d87 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -6,7 +6,7 @@ from haptools.validate import is_hapfile_valid PARENT_DATADIR = Path(__file__).parent.joinpath("data") -DATADIR = Path(__file__).parent.joinpath("data") / "valhap" +DATADIR = Path(__file__).parent.joinpath("data") / "validate" def test_generated_haplotypes(): From 46ac080fa4201606a871e33cf0ab4a23755b6125 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Oct 2023 20:16:44 -0700 Subject: [PATCH 36/44] add descriptions to all test commands --- docs/commands/validate.rst | 4 +- haptools/validate.py | 4 +- tests/test_validate.py | 105 ++++++++++++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index 136dabe1..a4e822f8 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -26,7 +26,7 @@ Examples ~~~~~~~~ .. 
code-block:: bash - haptools validate tests/data/valhap/basic.hap + haptools validate tests/data/validate/basic.hap Outputs a message specifying the amount of errors and warnings. @@ -38,7 +38,7 @@ All warnings and errors will be logged if there are any. .. code-block:: bash - haptools validate tests/data/valhap/no_version.hap + haptools validate tests/data/validate/no_version.hap .. code-block:: diff --git a/haptools/validate.py b/haptools/validate.py index 9a31f9b3..4fb89ff0 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -81,10 +81,10 @@ def validate_existence(self) -> bool: return is_ok def exists(self) -> bool: - return os.path.exists(self.filename) + return self.filename.exists() def is_regular(self): - return os.path.isfile(self.filename) + return self.filename.is_file() def is_readable(self) -> bool: return os.access(self.filename, os.R_OK) diff --git a/tests/test_validate.py b/tests/test_validate.py index 39fc9d87..11275f4f 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -10,6 +10,9 @@ def test_generated_haplotypes(): + """ + Tests the dummy .hap generated by the haptools test suite + """ hapfile = Path(PARENT_DATADIR / "simple.hap") pvarfile = Path(PARENT_DATADIR / "simple.pvar") @@ -17,136 +20,236 @@ def test_generated_haplotypes(): def test_with_empty_lines(): + """ + Tests a .hap with empty lines + """ assert is_hapfile_valid(DATADIR / "empty_lines.hap") def test_with_out_of_header_metas_sorted(): + """ + Test a sorted .hap with meta lines out of the header + """ assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=True) def test_with_out_of_header_metas_unsorted(): + """ + Test an unsorted .hap with meta lines out of the header + """ assert is_hapfile_valid(DATADIR / "out_of_header_metas.hap", sorted=False) def test_with_10_extras_reordered(): + """ + Tests a .hap file with 10 extra columns + """ assert is_hapfile_valid(DATADIR / "10_extras_reordered.hap") def test_with_unexistent_reorders(): + """ + 
Tests a .hap with an order[H|R|V] which mentions a non-existent extra column + """ assert not is_hapfile_valid(DATADIR / "unexistent_reorders.hap") def test_with_unexistent_fields(): + """ + Tests a .hap with a data line that is not an H, R or V + """ assert not is_hapfile_valid(DATADIR / "unexistent_fields.hap") def test_with_inadequate_version(): + """ + Tests a .hap with an incorrectly formatted version + """ assert not is_hapfile_valid(DATADIR / "inadequate_version.hap") def test_with_no_version(): + """ + Tests a .hap with no present version + """ assert not is_hapfile_valid(DATADIR / "no_version.hap") def test_with_multiple_versions(): + """ + Tests a .hap with several versions present + """ assert not is_hapfile_valid(DATADIR / "multiple_versions.hap") def test_with_inadequate_version_columns(): + """ + Tests a .hap with a version column of only 2 fields + """ assert not is_hapfile_valid(DATADIR / "inadequate_version_columns.hap") def test_with_invalid_column_addition_column_count(): + """ + Tests a .hap with an extra column declaration of invalid column count + """ assert not is_hapfile_valid(DATADIR / "invalid_column_addition_column_count.hap") def test_with_invalid_column_addition_types(): + """ + Tests a .hap with a column addition for a type which is not H, R or V + """ assert not is_hapfile_valid(DATADIR / "invalid_column_addition_types.hap") def test_with_invalid_column_addition_data_types(): + """ + Tests a .hap with a column addition of unrecognized data type (not s, d or .nf) + """ assert not is_hapfile_valid(DATADIR / "invalid_column_addition_data_types.hap") def test_with_insufficient_columns(): + """ + Tests a .hap with insufficient mandatory columns + """ assert not is_hapfile_valid(DATADIR / "insufficient_columns.hap") def test_with_inconvertible_starts(): + """ + Tests a .hap with start positions that can't be converted to integers + """ assert not is_hapfile_valid(DATADIR / "inconvertible_starts.hap") def test_with_inconvertible_ends(): + 
""" + Tests a .hap with end positions that can't be converted to integers + """ assert not is_hapfile_valid(DATADIR / "inconvertible_ends.hap") def test_with_inconvertible_starts_var(): + """ + Tests a .hap with start positions that can't be converted to integers in variants + """ assert not is_hapfile_valid(DATADIR / "inconvertible_starts_var.hap") def test_with_inconvertible_ends_var(): + """ + Tests a .hap with end positions that can't be converted to integers in variants + """ assert not is_hapfile_valid(DATADIR / "inconvertible_ends_var.hap") def test_start_after_end(): + """ + Tests a .hap with the start position placed after the end position + """ assert not is_hapfile_valid(DATADIR / "start_after_end.hap") def test_is_directory(): + """ + Tests a validation command with a filename that points to a directory + """ assert not is_hapfile_valid(DATADIR / "is_directory.hap") def test_with_variant_id_of_chromosome(): + """ + Tests a .hap with a variant whose ID is the same as a chromosome ID + """ assert not is_hapfile_valid(DATADIR / "variant_id_of_chromosome.hap") def test_with_hrid_of_chromosome(): + """ + Tests a .hap with a haplotype or repeat with the same ID as a chromosome + """ assert not is_hapfile_valid(DATADIR / "hrid_of_chromosome.hap") def test_with_unexistent_col_in_order(): + """ + Tests a .hap with an order[H|R|V] field that references a non-existent extra column name + """ assert not is_hapfile_valid(DATADIR / "unexistent_col_in_order.hap") def test_with_unassociated_haplotype(): + """ + Tests a .hap with a haplotype that does not have at least one matching repeat + """ assert not is_hapfile_valid(DATADIR / "unassociated_haplotype.hap") def test_with_unrecognizable_allele(): + """ + Tests a .hap with a variant whose allele is not G, C, T or A + """ assert not is_hapfile_valid(DATADIR / "unrecognizable_allele.hap") def test_with_duplicate_ids(): + """ + Tests a .hap with duplicate IDs for H and R fields + """ assert not is_hapfile_valid(DATADIR / 
"duplicate_ids.hap") def test_with_duplicate_vids_per_haplotype(): + """ + Tests a .hap with duplicate IDs for variants with the same haplotype association + """ assert not is_hapfile_valid(DATADIR / "duplicate_vids_per_haplotype.hap") def test_with_excol_of_wrong_type(): + """ + Tests a .hap with a data line which contains an extra column of d data type but receives s + """ assert not is_hapfile_valid(DATADIR / "excol_of_wrong_type.hap") def test_with_multiple_order_defs(): + """ + Tests a .hap with multiple order[H|R|V] of the same type + """ assert not is_hapfile_valid(DATADIR / "multiple_order_defs.hap") def test_with_insufficient_excols_in_reorder(): + """ + Tests a .hap with an order[H|R|V] that does not reference all extra columns + """ assert not is_hapfile_valid(DATADIR / "insufficient_excols_in_reorder.hap") def test_with_variant_inexistent_haplotype_id(): + """ + Tests a .hap with with a variant that references a non-existent haplotype + """ assert not is_hapfile_valid(DATADIR / "variant_inexistent_haplotype_id.hap") def test_with_missing_variant_in_pvar(): + """ + Tests a .hap along with a .pvar file which is missing an ID present in the .hap + """ assert not is_hapfile_valid( - DATADIR / "simple.hap", pvar=DATADIR / "basic_missing_ids.pvar", + DATADIR / "simple.hap", + pvar=DATADIR / "basic_missing_ids.pvar", ) def test_unreadable_hapfile(): + """ + Passes a non-existent file to the validator + """ assert not is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) From 3db45227fcdc360731af01856ca923b727fd2c48 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Oct 2023 20:17:19 -0700 Subject: [PATCH 37/44] fail validation if any lines are blank --- tests/data/validate/empty_lines.hap | 27 ++++----------------------- tests/test_validate.py | 2 +- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/tests/data/validate/empty_lines.hap b/tests/data/validate/empty_lines.hap index 32e2914b..d19089d2 
100644 --- a/tests/data/validate/empty_lines.hap +++ b/tests/data/validate/empty_lines.hap @@ -1,26 +1,7 @@ -# orderH ancestry beta # version 0.2.0 +H 21 100 110 haplotype_1 +H 21 110 125 haplotype_2 -#H ancestry s Local ancestry -#H beta .2f Effect size in linear model - -#R beta .2f Effect size in linear model - -H 21 26928472 26941960 chr21.q.3365*1 ASW 0.73 -R 21 26938353 26938400 21_26938353_STR 0.45 - -H 21 26938989 26941960 chr21.q.3365*10 CEU 0.30 -H 21 26938353 26938989 chr21.q.3365*11 MXL 0.49 - -V chr21.q.3365*1 26928472 26928472 21_26928472_C_A C -V chr21.q.3365*1 26938353 26938353 21_26938353_T_C T -V chr21.q.3365*1 26940815 26940815 21_26940815_T_C C -V chr21.q.3365*1 26941960 26941960 21_26941960_A_G G - -V chr21.q.3365*10 26938989 26938989 21_26938989_G_A A -V chr21.q.3365*10 26940815 26940815 21_26940815_T_C T -V chr21.q.3365*10 26941960 26941960 21_26941960_A_G A - -V chr21.q.3365*11 26938353 26938353 21_26938353_T_C T -V chr21.q.3365*11 26938989 26938989 21_26938989_G_A A +V haplotype_1 100 101 variant_1 C +V haplotype_2 110 111 variant_2 A diff --git a/tests/test_validate.py b/tests/test_validate.py index 11275f4f..a57a1b80 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -23,7 +23,7 @@ def test_with_empty_lines(): """ Tests a .hap with empty lines """ - assert is_hapfile_valid(DATADIR / "empty_lines.hap") + assert not is_hapfile_valid(DATADIR / "empty_lines.hap") def test_with_out_of_header_metas_sorted(): From 6288b8d0af27ca77430a7fab1178d10ff86c90fd Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 13 Oct 2023 21:34:22 -0700 Subject: [PATCH 38/44] add test for whitespace --- tests/test_validate.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_validate.py b/tests/test_validate.py index a57a1b80..b4c5f193 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -253,6 +253,44 @@ def test_unreadable_hapfile(): 
assert not is_hapfile_valid(Path("NON_EXISTENT_FILENAME.hap")) +def test_leading_trailing_whitespace(): + """ + We should fail if lines have any leading or trailing whitespace + """ + basic = DATADIR / "basic.hap" + with open(basic, "r") as basic_file: + basic_lines = basic_file.readlines() + + temp_basic = DATADIR / "leading_trailing_whitespace.hap" + lines = (0, 2, 3, 5) + + # test both kinds of whitespace: tabs and spaces + for space_kind in (" ", "\t"): + # test leading whitespace + for line in lines: + with open(temp_basic, "w") as temp_basic_file: + new_lines = basic_lines.copy() + new_lines[line] = space_kind + new_lines[line] + temp_basic_file.writelines(new_lines) + assert not is_hapfile_valid(temp_basic) + # test trailing whitespace + for line in lines: + with open(temp_basic, "w") as temp_basic_file: + new_lines = basic_lines.copy() + new_lines[line] = new_lines[line][:-1] + space_kind + "\n" + temp_basic_file.writelines(new_lines) + assert not is_hapfile_valid(temp_basic) + + # also try adding a space next to a tab + with open(temp_basic, "w") as temp_basic_file: + new_lines = basic_lines.copy() + new_lines[1] = new_lines[1][:2] + " " + new_lines[1][2:] + temp_basic_file.writelines(new_lines) + assert not is_hapfile_valid(temp_basic) + + temp_basic.unlink() + + def test_basic(capfd): hp_file = DATADIR / "basic.hap" From 0b0932ce5c390f20b949c7665f3c427eb4f399d9 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 14 Oct 2023 09:40:15 -0700 Subject: [PATCH 39/44] add test for indexed hap file --- tests/data/validate/basic.hap.gz | Bin 0 -> 137 bytes tests/data/validate/basic.hap.gz.tbi | Bin 0 -> 148 bytes tests/test_validate.py | 7 +++++++ 3 files changed, 7 insertions(+) create mode 100644 tests/data/validate/basic.hap.gz create mode 100644 tests/data/validate/basic.hap.gz.tbi diff --git a/tests/data/validate/basic.hap.gz b/tests/data/validate/basic.hap.gz new file mode 100644 index 
0000000000000000000000000000000000000000..30f9ed97129a1f7c45db059fb4b248da54401fc4 GIT binary patch literal 137 zcmb2|=3rp}f&Xj_PR>jWISj$ibi8%7&Yd~`*yMuoCF2L59+((CF)=ZCU}9qO;EadP ziIe&(bv$~_Tf7)pd3${(s`TtQG-0aa7Ac9MX{utWYR{%JxXuX-k#Ia?;IhKtQOOe~ Xh9e=?2it&l$fH>%&A<${8$jW@g8w& Date: Sat, 14 Oct 2023 09:43:41 -0700 Subject: [PATCH 40/44] start adding docstrings --- docs/commands/validate.rst | 6 +- haptools/__main__.py | 6 ++ haptools/transform.py | 2 +- haptools/validate.py | 120 ++++++++++++++++++++++++++++++++++++- 4 files changed, 127 insertions(+), 7 deletions(-) diff --git a/docs/commands/validate.rst b/docs/commands/validate.rst index a4e822f8..e4ac5902 100644 --- a/docs/commands/validate.rst +++ b/docs/commands/validate.rst @@ -4,7 +4,7 @@ validate ======== -Validate the formatting of a sorted :doc:`.hap file `. Output warnings/errors explaining how the formatting of your ``.hap`` file may be improved. +Validate the formatting of a :doc:`.hap file `. Output warnings/errors explaining how the formatting of your ``.hap`` file may be improved. If a :ref:`.pvar file ` file is provided, the SNPs and TRs present in the ``.hap`` file will be checked for existence in the ``.pvar`` file. @@ -46,8 +46,8 @@ All warnings and errors will be logged if there are any. [ INFO] Completed .hap file validation with 0 errors and 1 warnings. Error: Found several warnings and / or errors in the .hap file -All ``.hap`` files must be sorted before they can be validated. By default, we try our best to sort your ``.hap`` file internally before performing any validation checks. -If your ``.hap`` file is already sorted, you should use the ``--sorted`` parameter. It will speed things up a bit by skipping the sorting step. If your ``.hap`` file is indexed, it will be assumed to be sorted. +All ``.hap`` files must be sorted before they can be validated, so we try our best to sort your ``.hap`` file internally before performing any validation checks. 
+If your ``.hap`` file is already sorted, you should use the ``--sorted`` parameter. It will speed things up a bit by skipping the sorting step. If your ``.hap`` file is indexed, it will be assumed to be sorted regardless. .. code-block:: bash diff --git a/haptools/__main__.py b/haptools/__main__.py index 5e5675b2..d67bfe34 100755 --- a/haptools/__main__.py +++ b/haptools/__main__.py @@ -1057,6 +1057,12 @@ def validate( genotypes: Path | None = None, verbosity: str = "INFO", ): + """ + Validate the formatting of a .hap file + + Output warnings/errors explaining how the formatting of your .hap file may + be improved. + """ from .logging import getLogger from .validate import is_hapfile_valid diff --git a/haptools/transform.py b/haptools/transform.py index df70f6cb..807dca28 100644 --- a/haptools/transform.py +++ b/haptools/transform.py @@ -175,7 +175,7 @@ class GenotypesAncestry(data.GenotypesVCF): See documentation for :py:attr:`~.Genotypes.log` """ - def __init__(self, fname: Path | str, log: Logger = None): + def __init__(self, fname: Path | str, log: logging.Logger = None): super().__init__(fname, log) self.ancestry = None self.valid_labels = None diff --git a/haptools/validate.py b/haptools/validate.py index 4fb89ff0..0daad0b7 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -2,7 +2,6 @@ import os import logging - from re import search from pathlib import Path @@ -18,6 +17,20 @@ def tmpex(expectation: object, received: object) -> str: class Line: + """ + A line in the file + + Attributes + ---------- + columns : list[str] + The line split into separate columns + content : str + The content of the line as a string + number : int + The line number + count : int + The number of columns in this line + """ def __init__(self, content: str, number: int): self.content: str = content self.number: int = number @@ -26,18 +39,64 @@ def __init__(self, content: str, number: int): self.count: int = len(self.columns) def __getitem__(self, index: int) -> str: + 
""" + Index into the columns of the line + + Parameters + ---------- + index : int + The index into the line. Must be less than :py:attr:`~.Line.count` + + Returns + ------- + str + The column at this index + """ return self.columns[index] def __str__(self) -> str: + """ + Retrieve the line as a string + + Returns + ------- + str + The line + """ return self.content class HapFileIO: - def __init__(self, filename: Path, logger=None): + """ + Process lines from .hap files + + Attributes + ---------- + filename : Path + The path to the file + logger : logging.Logger, optional + A logging instance for recording errors/warnings statements + """ + def __init__(self, filename: Path, logger: logging.Logger = None): self.filename = filename self.log = logger or logging.getLogger(self.__class__.__name__) def lines(self, sorted: bool = False) -> list[Line]: + """ + Retrieve the lines of the file as Line instances + + Sort the lines if they're unsorted + + Parameters + ---------- + sorted : bool, optional + Whether the file can be assumed to be sorted + + Returns + ------- + list[Line] + The lines of the file + """ buffer = open(self.filename) content = [ @@ -64,6 +123,14 @@ def lines(self, sorted: bool = False) -> list[Line]: return content def validate_existence(self) -> bool: + """ + Check whether the .hap file exists and can be read + + Returns + ------- + bool + True if it exists and False otherwise + """ if not self.exists(): self.log.error(f"The file {self.filename} does not exist.") return False @@ -81,12 +148,38 @@ def validate_existence(self) -> bool: return is_ok def exists(self) -> bool: + """ + Check if the file exists + + Returns + ------- + bool + True if it exists and False otherwise + """ return self.filename.exists() def is_regular(self): + """ + Check if the file can be opened by python + + Symlinks are also allowed + + Returns + ------- + bool + True if it can be opened and False otherwise + """ return self.filename.is_file() def is_readable(self) -> bool: 
+ """ + Check if the file can be read by python + + Returns + ------- + bool + True if it can be read and False otherwise + """ return os.access(self.filename, os.R_OK) @@ -122,7 +215,7 @@ class HapFileValidator: KEY_ID: str = "HT::ID" KEY_ALLELE: str = "HT::Allele" - def __init__(self, logger=None): + def __init__(self, logger: logging.Logger = None): self.log = logger or logging.getLogger(self.__class__.__name__) self.vars_ex: dict[int, dict[str, type]] = { @@ -834,6 +927,27 @@ def is_hapfile_valid( max_variants: int = 10000, log: logging.Logger = None, ) -> bool: + """ + Checks whether a file is properly formatted + + Logs suggestions (warnings and errors) if it isn't + + Parameters + ---------- + filename : Path + The path to the file + sorted : bool, optional + Whether the file can be assumed to be sorted already + pvar : Path, optional + Path to a PVAR file with SNPs from the .hap file + log: logging.Logger, optional + A logging module to which to write messages about progress and any errors + + Returns + ------- + bool + True if the file is formatted correctly and False otherwise + """ if log == None: log = getLogger(name=LOGGER_NAME, level="CRITICAL") From 189eed07a82962f0d1f3f72d23a6fc129ec05ef5 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 14 Oct 2023 09:44:11 -0700 Subject: [PATCH 41/44] remove max_variants which we will instead infer from the hap file --- haptools/validate.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/haptools/validate.py b/haptools/validate.py index 0daad0b7..a415335b 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -923,8 +923,7 @@ def warnskip(self, line: Line): def is_hapfile_valid( filename: Path, sorted: bool = False, - pvar: Path | None = None, - max_variants: int = 10000, + pvar: Path = None, log: logging.Logger = None, ) -> bool: """ @@ -974,10 +973,13 @@ def is_hapfile_valid( hapfile.validate_version_declarations() + variants = 
set() + if pvar is not None: varfile = GenotypesPLINK(pvar.with_suffix(".pgen")) - varfile.read_variants(max_variants=max_variants) + varfile.read_variants(variants=variants) + # TODO: do this quicker by just checking whether the sets intersect ids = list(map(lambda v: v[0], varfile.variants)) hapfile.compare_haps_to_pvar(ids) From c042b8206b18814d8ae8448ce5795f525c00c20c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sun, 29 Oct 2023 11:20:38 -0700 Subject: [PATCH 42/44] start HapFileValidator class commenting --- haptools/validate.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/haptools/validate.py b/haptools/validate.py index a415335b..9acf6323 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -68,7 +68,7 @@ def __str__(self) -> str: class HapFileIO: """ - Process lines from .hap files + Process lines from a .hap file Attributes ---------- @@ -184,6 +184,32 @@ def is_readable(self) -> bool: class HapFileValidator: + """ + Validate lines from a .hap file + + Attributes + ---------- + log : logging.Logger, optional + A logging instance for recording errors/warnings statements + vars_ex : dict[int, dict[str, type]] + TODO + types_ex : dict[int, list[type]] + TODO + meta : list[Line] + TODO + data : dict[int, list[Line]] + TODO + hrids : dict[int, dict[str, Line]] + TODO + vrids : dict[str, dict[str, Line]] + TODO + referenced_chromosomes : set[str] + TODO + errc : int + TODO + warc : int + TODO + """ # H CHROM START END ID MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 From 35587643165223afe80af5ab94c5358f104e34b8 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 11 Nov 2023 10:01:02 -0800 Subject: [PATCH 43/44] add more comments to validate command --- haptools/validate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haptools/validate.py b/haptools/validate.py index 9acf6323..5cc385c4 
100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -192,11 +192,11 @@ class HapFileValidator: log : logging.Logger, optional A logging instance for recording errors/warnings statements vars_ex : dict[int, dict[str, type]] - TODO + The names of each of the extra columns for each of the line types types_ex : dict[int, list[type]] - TODO + The types of each of the extra columns for each of the line types meta : list[Line] - TODO + The metadata lines in the file data : dict[int, list[Line]] TODO hrids : dict[int, dict[str, Line]] From d91b2a38b0f4f8deaa94d8d8ec7191aab03b43bf Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:39:11 +0000 Subject: [PATCH 44/44] document metadata line handling code --- haptools/validate.py | 79 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/haptools/validate.py b/haptools/validate.py index 5cc385c4..c9baec35 100644 --- a/haptools/validate.py +++ b/haptools/validate.py @@ -192,23 +192,28 @@ class HapFileValidator: log : logging.Logger, optional A logging instance for recording errors/warnings statements vars_ex : dict[int, dict[str, type]] - The names of each of the extra columns for each of the line types + The names of each of the extra columns for each of the line types. The keys of + the outer dict encode each line type and the keys of the inner dict encode each + extra column types_ex : dict[int, list[type]] - The types of each of the extra columns for each of the line types - meta : list[Line] + The types of each of the extra columns for each of the line types. 
The keys of + the outer dict encode each line type and the keys of the inner dict encode each + extra column + meta_lines : list[Line] The metadata lines in the file data : dict[int, list[Line]] - TODO + A list of the lines, delineated by their line type (as the keys to the dict) hrids : dict[int, dict[str, Line]] - TODO + Each haplotype and repeat line, keyed by its ID. The outer dictionary encodes + line types vrids : dict[str, dict[str, Line]] - TODO + Each variant line, keyed by its ID. The outer dictionary encodes line types referenced_chromosomes : set[str] - TODO + A running list of the chromosomes that have been seen errc : int - TODO + A running count of the errors we've seen warc : int - TODO + A running count of the warnings we've seen """ # H CHROM START END ID MANDATORY_HAPLOTYPE_COLUMN_COUNT: int = 5 @@ -256,7 +261,7 @@ def __init__(self, logger: logging.Logger = None): HapFileValidator.KEY_VARIANT: [], } - self.meta: list[Line] = [] + self.meta_lines: list[Line] = [] self.data: dict[int, list[Line]] = { HapFileValidator.KEY_HAPLOTYPE: [], HapFileValidator.KEY_REPEAT: [], @@ -276,18 +281,45 @@ def __init__(self, logger: logging.Logger = None): self.warc: int = 0 def extract_and_store_content(self, file: HapFileIO, sorted: bool = False): + """ + Extract the header and data lines of a HapFileIO instance + + Parameters + ---------- + file : HapFileIO + The file object to extract and store content from. 
+ sorted : bool, optional + Flag indicating whether the lines are already sorted + """ lines = file.lines(sorted=sorted) self.extract_meta_lines(lines) self.extract_data_lines(lines) def extract_meta_lines(self, lines: list[Line]): + """ + Identify header lines in the file + + Parameters + ---------- + lines : list[Line] + The full set of lines, from which the header lines must be extracted + """ header_limit = next( i for i, line in enumerate(lines) if not line[0].startswith("#") ) self.meta_lines = lines[:header_limit] def extract_data_lines(self, lines: list[Line]): + """ + Identify non-header lines and categorize them based on their field type. + + Parameters + ---------- + lines : list[Line] + The full set of lines from the file + """ + # TODO: do not encode H, R, or V here but somewhere global ln = [ [ln for ln in lines if ln[0].startswith("H")], [ln for ln in lines if ln[0].startswith("R")], @@ -309,6 +341,13 @@ def extract_data_lines(self, lines: list[Line]): # def validate_version_declarations(self): + """ + Confirm that the version declaration is in the correct format + + This method extracts the version declaration and checks it's in the correct + format. If no version declarations are found, we assume the latest version and + issue a warning. + """ versions = self.extract_version_declarations() if len(versions) == 0: self.log.warning( @@ -320,6 +359,17 @@ def validate_version_declarations(self): self.validate_version_format(versions[-1]) def extract_version_declarations(self) -> list[Line]: + """ + Extracts version declarations from the meta lines + + Issues warnings for each version declaration after the first, since there + should only ever be one. 
+ + Returns + ------- + list[Line] + A list of version declarations as Line objects + """ decls = list( filter(lambda x: x.count > 1 and x[1] == "version", self.meta_lines) ) @@ -337,6 +387,14 @@ def extract_version_declarations(self) -> list[Line]: return decls def validate_version_format(self, version: Line): + """ + Validates the format of the version declaration + + Parameters + ---------- + version: Line + The line containing the version declaration + """ if version.count < 3: self.leexfl( "Not enough columns in version declaration", @@ -356,7 +414,6 @@ def validate_version_format(self, version: Line): version[2], version, ) - self.errc += 1 #