From 874567bda2dbea9a8586cf1ac464eae4bc9d1ef5 Mon Sep 17 00:00:00 2001
From: Richard Challis
Date: Wed, 10 Sep 2025 09:00:28 +0100
Subject: [PATCH 01/44] skip atypical assemblies

---
 flows/parsers/parse_ncbi_assemblies.py | 28 +++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py
index f9814f1..fa3651c 100644
--- a/flows/parsers/parse_ncbi_assemblies.py
+++ b/flows/parsers/parse_ncbi_assemblies.py
@@ -76,6 +76,28 @@ def fetch_ncbi_datasets_sequences(
         yield json.loads(line)
 
 
+def is_atypical_assembly(report: dict, parsed: dict) -> bool:
+    """
+    Check if an assembly is atypical.
+
+    Args:
+        report (dict): A dictionary containing the assembly information.
+        parsed (dict): A dictionary containing parsed data.
+
+    Returns:
+        bool: True if the assembly is atypical, False otherwise.
+    """
+    if "assemblyInfo" not in report:
+        return True
+    if report["assemblyInfo"].get("atypical", {}).get("isAtypical", False):
+        # delete from parsed if present
+        accession = report["accession"]
+        if accession in parsed:
+            del parsed[accession]
+        return True
+    return False
+
+
 def process_assembly_report(
     report: dict, previous_report: Optional[dict], config: Config, parsed: dict
 ) -> dict:
@@ -98,6 +120,8 @@ def process_assembly_report(
     Returns:
         dict: The updated report dictionary.
     """
+    if is_atypical_assembly(report, parsed):
+        return None
     processed_report = {**report, "processedAssemblyInfo": {"organelle": "nucleus"}}
     if "pairedAccession" in report:
         if processed_report["pairedAccession"].startswith("GCF_"):
@@ -336,7 +360,9 @@ def process_assembly_reports(
             processed_report = process_assembly_report(
                 report, previous_report, config, parsed
             )
-            if use_previous_report(processed_report, parsed, config):
+            if processed_report is None or use_previous_report(
+                processed_report, parsed, config
+            ):
                 continue
             fetch_and_parse_sequence_report(processed_report)
             append_features(processed_report, config)

From 2e7c0272463306b6c6f1764fd6cc5a2546eba488 Mon Sep 17 00:00:00 2001
From: Richard Challis
Date: Wed, 10 Sep 2025 09:29:10 +0100
Subject: [PATCH 02/44] include line number when reporting exceptions

---
 flows/parsers/parse_ncbi_assemblies.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py
index fa3651c..7462bf6 100644
--- a/flows/parsers/parse_ncbi_assemblies.py
+++ b/flows/parsers/parse_ncbi_assemblies.py
@@ -371,7 +371,11 @@ def process_assembly_reports(
             previous_report = processed_report
         except Exception as e:
             print(
-                f"Error processing report for {report.get('accession', 'unknown')}: {e}"
+                (
+                    f"Error processing report for "
+                    f"{report.get('accession', 'unknown')}: "
+                    f"{e} (line {e.__traceback__.tb_lineno})"
+                )
             )
             continue

From f419bd08fe735eb8d7a2fdb0849e754c76c64e19 Mon Sep 17 00:00:00 2001
From: Richard Challis
Date: Wed, 10 Sep 2025 09:36:28 +0100
Subject: [PATCH 03/44] include full exception

---
 flows/parsers/parse_ncbi_assemblies.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py
index 7462bf6..d20012d 100644
--- a/flows/parsers/parse_ncbi_assemblies.py
+++ b/flows/parsers/parse_ncbi_assemblies.py
@@ -8,6 +8,7 @@
 from collections import defaultdict
 from glob import glob
 from os.path import abspath, dirname
+from traceback import format_exc
 from typing import Generator, Optional
 
 from genomehubs import utils as
gh_utils @@ -377,6 +378,7 @@ def process_assembly_reports( f"{e} (line {e.__traceback__.tb_lineno})" ) ) + print(format_exc(e)) continue From 3cedc5885696d7a559d2a61a870bd4c6a8435cb6 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 10 Sep 2025 09:40:15 +0100 Subject: [PATCH 04/44] add more debugging --- flows/parsers/parse_ncbi_assemblies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index d20012d..8dbbc21 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -240,6 +240,8 @@ def add_report_to_parsed_reports( continue linked_row = parsed[acc] if accession not in linked_row["linkedAssembly"]: + print(f"Linking {accession} to {acc}") + print(linked_row["linkedAssembly"]) linked_row["linkedAssembly"].append(accession) if acc not in row["linkedAssembly"]: row["linkedAssembly"].append(acc) From 8ac1f84cb51292320eb72e7391c337dbbe7442dd Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 10 Sep 2025 09:42:57 +0100 Subject: [PATCH 05/44] ensure linkedAssembly is not an empty string --- flows/parsers/parse_ncbi_assemblies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index 8dbbc21..a4f7be4 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -240,8 +240,8 @@ def add_report_to_parsed_reports( continue linked_row = parsed[acc] if accession not in linked_row["linkedAssembly"]: - print(f"Linking {accession} to {acc}") - print(linked_row["linkedAssembly"]) + if not linked_row["linkedAssembly"]: + linked_row["linkedAssembly"] = [] linked_row["linkedAssembly"].append(accession) if acc not in row["linkedAssembly"]: row["linkedAssembly"].append(acc) From 1d22f9cfc3ded3d458ed823fbedae7742d57e0b4 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 10 Sep 2025 09:54:08 +0100 Subject: [PATCH 06/44] ensure linked assembly is a list --- flows/parsers/parse_ncbi_assemblies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index a4f7be4..c6c9ec6 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -240,7 +240,7 @@ def add_report_to_parsed_reports( continue linked_row = parsed[acc] if accession not in linked_row["linkedAssembly"]: - if not linked_row["linkedAssembly"]: + if not isinstance(linked_row["linkedAssembly"], list): linked_row["linkedAssembly"] = [] linked_row["linkedAssembly"].append(accession) if acc not in row["linkedAssembly"]: From fda4b334732ba08908e2255b78d042602962d4ff Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 10 Sep 2025 09:56:15 +0100 Subject: [PATCH 07/44] tidy debugging code --- flows/parsers/parse_ncbi_assemblies.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index c6c9ec6..f2fad6c 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -8,7 +8,6 @@ from collections import defaultdict from glob import glob from os.path import abspath, dirname -from traceback import format_exc from typing import Generator, Optional from genomehubs import utils as gh_utils @@ -380,7 +379,6 @@ def process_assembly_reports( f"{e} (line {e.__traceback__.tb_lineno})" ) ) - print(format_exc(e)) continue From 
eebe07f991de5e9044dbc3b3a322e7ad269e3fdf Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 10 Sep 2025 13:51:48 +0100 Subject: [PATCH 08/44] add update flows for ncbi taxonomy and tolid prefixes --- .vscode/settings.json | 5 +- flows/lib/utils.py | 132 ++++++++++++++++++++++- flows/updaters/update_ncbi_taxonomy.py | 136 ++++++++++++++++++++++++ flows/updaters/update_tolid_prefixes.py | 126 ++++++++++++++++++++++ 4 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 flows/updaters/update_ncbi_taxonomy.py create mode 100644 flows/updaters/update_tolid_prefixes.py diff --git a/.vscode/settings.json b/.vscode/settings.json index a656443..292308e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,5 +12,8 @@ "yaml.schemas": { "swaggerviewer:openapi": "file:///Users/rchallis/projects/genomehubs/genomehubs/src/genomehubs-api/src/api-v2.yaml" }, - "restructuredtext.pythonRecommendation.disabled": true + "restructuredtext.pythonRecommendation.disabled": true, + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda", + "python-envs.pythonProjects": [] } diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 9149fc3..6bb8836 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -1,6 +1,8 @@ #!/usr/bin/python3 import contextlib +import hashlib +import os from argparse import Action from csv import DictReader, Sniffer from datetime import datetime @@ -9,6 +11,8 @@ import boto3 import requests +from botocore.exceptions import ClientError +from dateutil import parser from genomehubs import utils as gh_utils @@ -640,4 +644,130 @@ def parse_tsv(text: str) -> List[Dict[str, str]]: sniffer = Sniffer() dialect = sniffer.sniff(text) reader = DictReader(StringIO(text), dialect=dialect) - return [row for row in reader] + return list(reader) + + +def last_modified_git_remote(http_path: str) -> Optional[str]: + """ + Get the last modified date of a file in a git repository. + + Args: + http_path (str): Path to the HTTP file. + + Returns: + str: Last modified date of the file. + """ + project_path = http_path.removeprefix("https://gitlab.com/") + project_path = project_path.removesuffix(".git") + parts = project_path.split("/") + project = "%2F".join(parts[:2]) + ref = parts[4] + file = "%2F".join(parts[5:]).split("?")[0] + project_path = project_path.replace("/", "%2F") + api_url = ( + f"https://gitlab.com/api/v4/projects/{project}/repository/commits" + f"?ref_name={ref}&path={file}&per_page=1" + ) + response = requests.get(api_url) + if response.status_code == 200: + if commits := response.json(): + if committed_date := commits[0].get("committed_date", None): + dt = parser.isoparse(committed_date) + return int(dt.timestamp()) + else: + response = requests.head(http_path, allow_redirects=True) + if response.status_code == 200: + return response.headers.get("Last-Modified", None) + return None + + +def last_modified_http(http_path: str) -> Optional[str]: + """ + Get the last modified date of a file. + + Args: + http_path (str): Path to the HTTP file. + + Returns: + str: Last modified date of the file. 
+ """ + if "gitlab.com" in http_path: + return last_modified_git_remote(http_path) + response = requests.head(http_path, allow_redirects=True) + if response.status_code == 200: + if last_modified := response.headers.get("Last-Modified", None): + dt = parser.parse(last_modified) + return int(dt.timestamp()) + return None + return None + + +def last_modified_s3(s3_path: str) -> Optional[str]: + """ + Get the last modified date of a file on S3. + + Args: + s3_path (str): Path to the remote file on s3. + + Returns: + str: Last modified date of the file. + """ + s3 = boto3.client("s3") + + # Extract bucket name and key from the S3 path + def parse_s3_path(s3_path): + bucket, key = s3_path.removeprefix("s3://").split("/", 1) + return bucket, key + + bucket, key = parse_s3_path(s3_path) + + # Return None if the remote file does not exist + try: + response = s3.head_object(Bucket=bucket, Key=key) + last_modified = response.get("LastModified", None) + return int(last_modified.timestamp()) if last_modified is not None else None + except ClientError: + return None + + +def last_modified(local_path: str) -> Optional[str]: + """ + Get the last modified date of a local file. + + Args: + local_path (str): Path to the local file. + + Returns: + str: Last modified date of the file. + """ + if not os.path.exists(local_path): + return None + mtime = os.path.getmtime(local_path) + return int(mtime) + + +def is_local_file_current_http(local_path: str, http_path: str) -> bool: + """ + Compare the last modified date of a local file with a remote file on HTTP. + + Args: + local_path (str): Path to the local file. + http_path (str): Path to the HTTP directory. + + Returns: + bool: True if the local file is up-to-date, False otherwise. + """ + local_date = last_modified(local_path) + remote_date = last_modified_http(http_path) + print(f"Local date: {local_date}, Remote date: {remote_date}") + if local_date is None or remote_date is None: + return False + return local_date >= remote_date + + +def generate_md5(file_path): + hash_md5 = hashlib.md5() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py new file mode 100644 index 0000000..1e37a37 --- /dev/null +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +from os.path import abspath, dirname + +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + __package__ = "flows" + +from flows.lib.conditional_import import emit_event, flow, task +from flows.lib.shared_args import ( + OUTPUT_PATH, + ROOT_TAXID, + S3_PATH, + default, + parse_args, + required, +) +from flows.lib.utils import generate_md5, is_local_file_current_http + + +@task(retries=2, retry_delay_seconds=2, log_prints=True) +def fetch_ncbi_taxonomy( + root_taxid: str, + local_path: str, + http_path: str = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", +) -> bool: + """ + Fetch the NCBI taxonomy dump and filter by root taxon if specified. + + Args: + root_taxid (str): Root taxon ID to filter by. + http_path (str): URL to fetch the taxonomy dump from. + local_path (str): Path to save the taxonomy dump. + + Returns: + bool: True if the fetched file matches the remote version, False otherwise. 
+ """ + # create local_path if it doesn't exist + os.makedirs(local_path, exist_ok=True) + local_gz_file = f"{local_path}/taxdump.tar.gz" + # Fetch the remote file + cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] + print(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + + remote_md5_path = f"{http_path}.md5" + # Fetch the remote MD5 checksum + cmd = ["curl", "-sSL", remote_md5_path] + print(f"Running command: {' '.join(cmd)}") + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + remote_md5 = result.stdout.split()[0] + + # Calculate the local MD5 checksum + local_md5 = generate_md5(local_gz_file) + print(f"Local MD5: {local_md5}, Remote MD5: {remote_md5}") + + if local_md5 != remote_md5: + print("MD5 checksums do not match. The file may be corrupted.") + return False + + # extract the tar.gz file + cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] + print(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + + # set the timestamp of extracted files to match the tar.gz file + gz_mtime = os.path.getmtime(local_gz_file) + for fname in os.listdir(local_path): + fpath = os.path.join(local_path, fname) + if os.path.isfile(fpath): + os.utime(fpath, (gz_mtime, gz_mtime)) + + # remove the tar.gz file + os.remove(local_gz_file) + + return True + + +@task(log_prints=True) +def taxonomy_is_up_to_date(local_path: str, http_path: str) -> bool: + """ + Check if the local NCBI taxonomy file is up-to-date with the remote file. + + Args: + local_path (str): Path to the local file. + http_path (str): Path to the remote file on HTTP. + + Returns: + bool: True if the local file is up-to-date, False otherwise. + """ + return is_local_file_current_http(f"{local_path}/nodes.dmp", http_path) + + +@flow() +def update_ncbi_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None: + """Fetch and optionally update the NCBI taxonomy dump. + + Args: + root_taxid (str): Root taxon ID to filter by. + output_path (str): Path to save the taxonomy dump. + s3_path (str): S3 path to compare with. 
+ """ + http_path = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" + status = None + if taxonomy_is_up_to_date(output_path, http_path): + status = True + else: + status = False + fetch_ncbi_taxonomy( + local_path=output_path, http_path=http_path, root_taxid=root_taxid + ) + print(f"Taxonomy update status: {status}") + + emit_event( + event="update.ncbi.taxonomy.finished", + resource={ + "prefect.resource.id": f"fetch.taxonomy.{output_path}", + "prefect.resource.type": "ncbi.taxonomy", + "prefect.resource.matches.previous": "yes" if status else "no", + }, + payload={"matches_previous": status}, + ) + return status + + +if __name__ == "__main__": + """Run the flow.""" + args = parse_args( + [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], + "Fetch NCBI taxdump and optionally filter by root taxon.", + ) + + update_ncbi_taxonomy(**vars(args)) diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py new file mode 100644 index 0000000..679be7f --- /dev/null +++ b/flows/updaters/update_tolid_prefixes.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +from os.path import abspath, dirname + +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + __package__ = "flows" + +from flows.lib.conditional_import import emit_event, flow, task +from flows.lib.shared_args import ( + OUTPUT_PATH, + ROOT_TAXID, + S3_PATH, + default, + parse_args, + required, +) +from flows.lib.utils import is_local_file_current_http + + +@task(retries=2, retry_delay_seconds=2, log_prints=True) +def fetch_tolid_prefixes( + root_taxid: str, + local_path: str, + http_path: str = ( + "https://gitlab.com/wtsi-grit/darwin-tree-of-life-sample-naming/" + "-/raw/master/tolids.txt?ref_type=heads" + ), + min_lines: int = 400000, +) -> bool: + """ + Fetch the ToLID prefix file and filter by root taxon if specified. + + Args: + root_taxid (str): Root taxon ID to filter by. + http_path (str): URL to fetch the ToLID prefix file from. + local_path (str): Path to save the ToLID prefix file. + + Returns: + bool: True if the fetched file matches the remote version, False otherwise. + """ + # create local_path if it doesn't exist + os.makedirs(local_path, exist_ok=True) + local_file = f"{local_path}/tolids.txt" + # Fetch the remote file + cmd = ["curl", "-sSL", http_path, "-o", local_file] + print(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + + # check the number of lines in the file + with open(local_file, "r") as f: + num_lines = sum(1 for _ in f) + if num_lines < min_lines: + print(f"File has too few lines: {num_lines} < {min_lines}") + return False, num_lines + + return True, num_lines + + +@task(log_prints=True) +def tolid_file_is_up_to_date(local_path: str, http_path: str) -> bool: + """ + Check if the local ToLID prefixes file is up-to-date with the remote file. + + Args: + local_path (str): Path to the local file. + http_path (str): Path to the remote file on HTTP. + + Returns: + bool: True if the local file is up-to-date, False otherwise. + """ + return is_local_file_current_http(f"{local_path}/tolids.txt", http_path) + + +@flow() +def update_tolid_prefixes(root_taxid: str, output_path: str, s3_path: str) -> None: + """Fetch and optionally update the ToLID prefixes file. + + Args: + root_taxid (str): Root taxon ID to filter by. + output_path (str): Path to save the taxonomy dump. + s3_path (str): S3 path to compare with. 
+ """ + http_path = ( + "https://gitlab.com/wtsi-grit/darwin-tree-of-life-sample-naming/" + "-/raw/master/tolids.txt?ref_type=heads" + ) + status = None + complete = False + if tolid_file_is_up_to_date(output_path, http_path): + status = True + complete = True + line_count = 0 + with open(f"{output_path}/tolids.txt", "r") as f: + line_count = sum(1 for _ in f) + else: + status = False + complete, line_count = fetch_tolid_prefixes( + local_path=output_path, http_path=http_path, root_taxid=root_taxid + ) + print(f"TolID file matches previous: {status}") + + if complete: + emit_event( + event="update.tolid.prefixes.finished", + resource={ + "prefect.resource.id": f"fetch.tolid.prefixes.{output_path}", + "prefect.resource.type": "tolid.prefixes", + "prefect.resource.matches.previous": "yes" if status else "no", + }, + payload={"matches_previous": status, "line_count": line_count}, + ) + return status + + +if __name__ == "__main__": + """Run the flow.""" + args = parse_args( + [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], + "Fetch ToLID prefixes and optionally filter by root taxon.", + ) + + update_tolid_prefixes(**vars(args)) From ac216911d6096ac144189642aad345552a4fa9a6 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Thu, 11 Sep 2025 10:05:18 +0100 Subject: [PATCH 09/44] add ott and ena taxonomy updaters --- flows/updaters/update_ena_taxonomy_extra.py | 172 ++++++++++++++++++ flows/updaters/update_ott_taxonomy.py | 182 ++++++++++++++++++++ flows/updaters/update_tolid_prefixes.py | 1 + 3 files changed, 355 insertions(+) create mode 100644 flows/updaters/update_ena_taxonomy_extra.py create mode 100644 flows/updaters/update_ott_taxonomy.py diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py new file mode 100644 index 0000000..3c06976 --- /dev/null +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +import json +import os +import sys +from os.path import abspath, dirname +from urllib.request import urlopen + +from tqdm import tqdm + +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + __package__ = "flows" + +from flows.lib.conditional_import import emit_event, flow, task +from flows.lib.shared_args import ( + APPEND, + OUTPUT_PATH, + ROOT_TAXID, + S3_PATH, + TAXDUMP_PATH, + default, + parse_args, + required, +) + + +@task() +def read_ncbi_tax_ids(taxdump_path: str) -> set[str]: + """Read NCBI tax IDs from the taxdump nodes file.""" + print(f"Reading NCBI taxids from {taxdump_path}") + tax_ids = set() + nodes_file = os.path.join(taxdump_path, "nodes.dmp") + with open(nodes_file, "r") as f: + for line in f: + fields = line.strip().split("\t") + if len(fields) > 1: + tax_ids.add(fields[0]) + return tax_ids + + +@task() +def add_jsonl_tax_ids(jsonl_path: str, tax_ids: set[str]) -> None: + print(f"Reading previously fetched ENA taxids from {jsonl_path}") + filtered_path = f"{jsonl_path}.filtered" + try: + with open(jsonl_path, "r") as f, open(filtered_path, "w") as f_out: + for line in f: + data = json.loads(line) + tax_id = data["taxId"] + if tax_id not in tax_ids: + f_out.write(line) + tax_ids.add(tax_id) + os.replace(filtered_path, jsonl_path) + except Exception as e: + print(f"Error reading {jsonl_path}: {e}") + exit() + + +@task(log_prints=True) +def get_ena_api_new_taxids(root_taxid: str, existing_tax_ids: set[str]) -> set[str]: + print(f"Fetching new taxids for tax_tree({root_taxid}) from ENA API") + + limit = 
10000000 + url = ( + f"https://www.ebi.ac.uk/ena/portal/api/search?result=taxon" + f"&query=tax_tree({root_taxid})&limit={limit}" + ) + + # Stream the content of the URL + column_index = None + new_tax_ids = set() + with urlopen(url) as response: + for line in response: + columns = line.decode("utf-8").strip().split("\t") + if column_index is None: + column_index = 0 if columns[0] == "tax_id" else 1 + else: + tax_id = columns[column_index] + if tax_id not in existing_tax_ids: + new_tax_ids.add(tax_id) + return new_tax_ids + + +def fetch_ena_jsonl(tax_id, f_out): + print("Fetching new tax_ids from ENA API") + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/" + with urlopen(url + tax_id) as response: + for line in response: + f_out.write(line.decode("utf-8").strip()) + f_out.write("\n") + + +@task(log_prints=True) +def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> None: + print(f"Updating ENA JSONL file at {output_path} with new tax IDs") + url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/" + try: + os.makedirs(os.path.dirname(output_path), exist_ok=True) + with open(output_path, "a" if append else "w") as f: + for tax_id in tqdm(new_tax_ids, desc="Fetching ENA tax IDs"): + try: + with urlopen(url + tax_id) as response: + for line in response: + f.write(line.decode("utf-8").strip()) + f.write("\n") + except Exception as e: + print(f"Error fetching {tax_id}: {e}") + except Exception as e: + print(f"Error updating {output_path}: {e}") + + +@flow() +def update_ena_taxonomy_extra( + root_taxid: str, taxdump_path: str, output_path: str, append: bool +) -> None: + """Update the ENA taxonomy JSONL file. + + Args: + root_taxid (str): Root taxon ID to filter by. + taxdump_path (str): Path to the NCBI taxdump files. + output_path (str): Path to save the taxonomy dump. + append (bool): Flag to append entries to an existing JSONL file. + """ + + # 1. read IDs from ncbi nodes file + existing_tax_ids = read_ncbi_tax_ids(taxdump_path) + # 2. read existing IDs from local JSONL file + if append: + add_jsonl_tax_ids(output_path, existing_tax_ids) + # 3. fetch list of new IDs from ENA API + new_tax_ids = get_ena_api_new_taxids(root_taxid, existing_tax_ids) + # 4. 
fetch details for new IDs from ENA API and save to JSONL file + update_ena_jsonl(new_tax_ids, output_path, append) + + # http_path = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" + # status = None + # if taxonomy_is_up_to_date(output_path, http_path): + # status = True + # else: + # status = False + # fetch_ncbi_taxonomy( + # local_path=output_path, http_path=http_path, root_taxid=root_taxid + # ) + # print(f"Taxonomy update status: {status}") + + # emit_event( + # event="update.ncbi.taxonomy.finished", + # resource={ + # "prefect.resource.id": f"fetch.taxonomy.{output_path}", + # "prefect.resource.type": "ncbi.taxonomy", + # "prefect.resource.matches.previous": "yes" if status else "no", + # }, + # payload={"matches_previous": status}, + # ) + # return status + + +if __name__ == "__main__": + """Run the flow.""" + args = parse_args( + [ + default(ROOT_TAXID, "2759"), + required(TAXDUMP_PATH), + required(OUTPUT_PATH), + APPEND, + # S3_PATH, + ], + "Fetch extra taxa from ENA taxonomy API and optionally filter by root taxon.", + ) + + update_ena_taxonomy_extra(**vars(args)) diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py new file mode 100644 index 0000000..003b7df --- /dev/null +++ b/flows/updaters/update_ott_taxonomy.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 + +# sourcery skip: avoid-builtin-shadow +import os +import subprocess +import sys +from os.path import abspath, dirname + +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + __package__ = "flows" + +import json + +from flows.lib.conditional_import import emit_event, flow, task +from flows.lib.shared_args import ( + OUTPUT_PATH, + ROOT_TAXID, + S3_PATH, + default, + parse_args, + required, +) +from flows.lib.utils import is_local_file_current_http + + +@task(retries=2, retry_delay_seconds=2, log_prints=True) +def fetch_ott_taxonomy( + root_taxid: str, + local_path: str, + http_path: str, +) -> bool: + """ + Fetch the OTT taxonomy and filter by root taxon if specified. + + Args: + root_taxid (str): Root taxon ID to filter by. + http_path (str): URL to fetch the taxonomy from. + local_path (str): Path to save the taxonomy. + + Returns: + bool: True if the fetched file matches the remote version, False otherwise. 
+ """ + # create local_path if it doesn't exist + os.makedirs(local_path, exist_ok=True) + local_gz_file = f"{local_path}/ott.tar.gz" + # Fetch the remote file + cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] + print(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + + # extract the tar.gz file + cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] + print(f"Running command: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + + # Find the extracted subdirectory (should start with 'ott') + extracted_dirs = [ + d + for d in os.listdir(local_path) + if os.path.isdir(os.path.join(local_path, d)) and d.startswith("ott") + ] + if not extracted_dirs: + raise RuntimeError("No extracted ott directory found.") + ott_dir = os.path.join(local_path, extracted_dirs[0]) + + # Move all files from the ott subdirectory to local_path + for fname in os.listdir(ott_dir): + src = os.path.join(ott_dir, fname) + dst = os.path.join(local_path, fname) + if os.path.exists(dst): + os.remove(dst) + os.rename(src, dst) + + # Remove the now-empty ott subdirectory + os.rmdir(ott_dir) + + # set the timestamp of extracted files to match the tar.gz file + gz_mtime = os.path.getmtime(local_gz_file) + for fname in os.listdir(local_path): + fpath = os.path.join(local_path, fname) + if os.path.isfile(fpath): + os.utime(fpath, (gz_mtime, gz_mtime)) + + # remove the tar.gz file + os.remove(local_gz_file) + + return True + + +@task(log_prints=True) +def ott_taxonomy_is_up_to_date(local_path: str, http_path: str) -> bool: + """ + Check if the local OTT taxonomy file is up-to-date with the remote file. + + Args: + local_path (str): Path to the local file. + http_path (str): Path to the remote file on HTTP. + + Returns: + bool: True if the local file is up-to-date, False otherwise. + """ + return is_local_file_current_http(f"{local_path}/taxonomy.tsv", http_path) + + +def set_ott_url() -> str: + """Set the OTT URL. + + Returns: + str: The OTT URL. + """ + + # Fetch OTT taxonomy info as JSON + cmd = [ + "curl", + "-X", + "POST", + "-s", + "https://api.opentreeoflife.org/v3/taxonomy/about", + ] + print(f"Running command: {' '.join(cmd)}") + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + ott_json = json.loads(result.stdout) + + # Extract required fields + source = ott_json.get("source", "") + name = ott_json.get("name", "") + version = ott_json.get("version", "") + + # Replace "draft" with "." in source to get OTT_VERSION + ott_version = source.replace("draft", ".") + ott_major_version = f"{name}{version}" + + return ( + f"https://files.opentreeoflife.org/ott/" + f"{ott_major_version}/{ott_version}.tgz" + ) + + +@flow() +def update_ott_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None: + """Fetch and optionally update the OTT taxonomy file. + + Args: + root_taxid (str): Root taxon ID to filter by. + output_path (str): Path to save the taxonomy dump. + s3_path (str): S3 path to compare with. 
+ """ + http_path = set_ott_url() + status = None + complete = False + if ott_taxonomy_is_up_to_date(output_path, http_path): + status = True + complete = True + else: + status = False + complete = fetch_ott_taxonomy( + local_path=output_path, http_path=http_path, root_taxid=root_taxid + ) + print(f"OTT taxonomy file matches previous: {status}") + + if complete: + emit_event( + event="update.ott.taxonomy.finished", + resource={ + "prefect.resource.id": f"fetch.ott.taxonomy.{output_path}", + "prefect.resource.type": "ott.taxonomy", + "prefect.resource.matches.previous": "yes" if status else "no", + }, + payload={"matches_previous": status}, + ) + return status + + +if __name__ == "__main__": + """Run the flow.""" + args = parse_args( + [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], + "Fetch OTT taxonomy file and optionally filter by root taxon.", + ) + + update_ott_taxonomy(**vars(args)) diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index 679be7f..8a4ba0c 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +# sourcery skip: avoid-builtin-shadow import os import subprocess import sys From 307fbf729e46b6d7317d8cb2a775d24eaa7c8c73 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 10:39:57 +0100 Subject: [PATCH 10/44] use s3 file for ena taxonomy extra update --- flows/lib/utils.py | 56 ++++++++++++++++++ flows/updaters/update_ena_taxonomy_extra.py | 63 ++++++++++++--------- 2 files changed, 92 insertions(+), 27 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 6bb8836..2db32cb 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -600,6 +600,62 @@ def find_s3_file(s3_path: list, filename: str) -> str: return None +def fetch_from_s3(s3_path: str, local_path: str) -> None: + """ + Fetch a file from S3. + + Args: + s3_path (str): Path to the remote file on s3. + local_path (str): Path to the local file. + + Returns: + None: This function downloads the file from S3 to the local path. + """ + s3 = boto3.client("s3") + + # Extract bucket name and key from the S3 path + def parse_s3_path(s3_path): + bucket, key = s3_path.removeprefix("s3://").split("/", 1) + return bucket, key + + bucket, key = parse_s3_path(s3_path) + + # Download the file from S3 to the local path + try: + s3.download_file(Bucket=bucket, Key=key, Filename=local_path) + except ClientError as e: + print(f"Error downloading {s3_path} to {local_path}: {e}") + raise e + + +def upload_to_s3(local_path: str, s3_path: str) -> None: + """ + Upload a file to S3. + + Args: + local_path (str): Path to the local file. + s3_path (str): Path to the remote file on s3. + + Returns: + None: This function uploads the local file to S3. 
+ """ + s3 = boto3.client("s3") + + # Extract bucket name and key from the S3 path + def parse_s3_path(s3_path): + bucket, key = s3_path.removeprefix("s3://").split("/", 1) + return bucket, key + + bucket, key = parse_s3_path(s3_path) + + # Upload the file to S3 + try: + s3.upload_file(Filename=local_path, Bucket=bucket, Key=key) + except ClientError as e: + print(f"Error uploading {local_path} to {s3_path}: {e}") + raise e + + def set_index_name( index_type: str, hub_name: str, diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 3c06976..faa6781 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -8,6 +8,8 @@ from tqdm import tqdm +from flows.lib.utils import fetch_from_s3, upload_to_s3 + if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) __package__ = "flows" @@ -110,9 +112,19 @@ def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> N print(f"Error updating {output_path}: {e}") +def fetch_s3_jsonl(s3_path: str, local_path: str) -> None: + print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") + fetch_from_s3(s3_path, local_path) + + +def upload_s3_jsonl(local_path: str, s3_path: str) -> None: + print(f"Uploading updated ENA JSONL file from {local_path} to {s3_path}") + upload_to_s3(local_path, s3_path) + + @flow() def update_ena_taxonomy_extra( - root_taxid: str, taxdump_path: str, output_path: str, append: bool + root_taxid: str, taxdump_path: str, output_path: str, s3_path: str, append: bool ) -> None: """Update the ENA taxonomy JSONL file. @@ -125,35 +137,32 @@ def update_ena_taxonomy_extra( # 1. read IDs from ncbi nodes file existing_tax_ids = read_ncbi_tax_ids(taxdump_path) - # 2. read existing IDs from local JSONL file + # 2. fetch jsonl file from s3 if s3_path is provided + if s3_path: + fetch_s3_jsonl(s3_path, output_path) + # 3. read existing IDs from local JSONL file if append: add_jsonl_tax_ids(output_path, existing_tax_ids) - # 3. fetch list of new IDs from ENA API + # 4. fetch list of new IDs from ENA API new_tax_ids = get_ena_api_new_taxids(root_taxid, existing_tax_ids) - # 4. fetch details for new IDs from ENA API and save to JSONL file + # 5. fetch details for new IDs from ENA API and save to JSONL file update_ena_jsonl(new_tax_ids, output_path, append) - - # http_path = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" - # status = None - # if taxonomy_is_up_to_date(output_path, http_path): - # status = True - # else: - # status = False - # fetch_ncbi_taxonomy( - # local_path=output_path, http_path=http_path, root_taxid=root_taxid - # ) - # print(f"Taxonomy update status: {status}") - - # emit_event( - # event="update.ncbi.taxonomy.finished", - # resource={ - # "prefect.resource.id": f"fetch.taxonomy.{output_path}", - # "prefect.resource.type": "ncbi.taxonomy", - # "prefect.resource.matches.previous": "yes" if status else "no", - # }, - # payload={"matches_previous": status}, - # ) - # return status + # 6. 
upload updated JSONL file to s3 if s3_path is provided + if s3_path: + upload_s3_jsonl(output_path, s3_path) + + status = len(new_tax_ids) == 0 + + emit_event( + event="update.ena.taxonomy.finished", + resource={ + "prefect.resource.id": f"fetch.taxonomy.{output_path}", + "prefect.resource.type": "ena.taxonomy", + "prefect.resource.matches.previous": ("yes" if status else "no"), + }, + payload={"matches_previous": status}, + ) + return status if __name__ == "__main__": @@ -163,8 +172,8 @@ def update_ena_taxonomy_extra( default(ROOT_TAXID, "2759"), required(TAXDUMP_PATH), required(OUTPUT_PATH), + S3_PATH, APPEND, - # S3_PATH, ], "Fetch extra taxa from ENA taxonomy API and optionally filter by root taxon.", ) From fe622c351028f50be61c3ae42b4d4108c16463ac Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 10:40:54 +0100 Subject: [PATCH 11/44] make 3s fetch conditional on append --- flows/updaters/update_ena_taxonomy_extra.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index faa6781..853f91c 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -137,11 +137,11 @@ def update_ena_taxonomy_extra( # 1. read IDs from ncbi nodes file existing_tax_ids = read_ncbi_tax_ids(taxdump_path) - # 2. fetch jsonl file from s3 if s3_path is provided - if s3_path: - fetch_s3_jsonl(s3_path, output_path) - # 3. read existing IDs from local JSONL file if append: + # 2. fetch jsonl file from s3 if s3_path is provided + if s3_path: + fetch_s3_jsonl(s3_path, output_path) + # 3. read existing IDs from local JSONL file add_jsonl_tax_ids(output_path, existing_tax_ids) # 4. fetch list of new IDs from ENA API new_tax_ids = get_ena_api_new_taxids(root_taxid, existing_tax_ids) From 56ddc62c532fa05155bd50a0e32985dcf079a72e Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 13:59:21 +0100 Subject: [PATCH 12/44] add genomehubs taxonomy updater --- flows/prefect.yaml | 79 +++++++++ flows/updaters/update_ena_taxonomy_extra.py | 11 +- flows/updaters/update_genomehubs_taxonomy.py | 161 +++++++++++++++++++ 3 files changed, 247 insertions(+), 4 deletions(-) create mode 100644 flows/updaters/update_genomehubs_taxonomy.py diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 96d98f9..7aec390 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -30,6 +30,8 @@ definitions: cron: "5 0 * * 1-6" weekly: &weekly # Runs every Monday at 00:05 UTC cron: "5 0 * * 1" + monthly: &monthly # Runs on the 1st of every month at 00:05 UTC + cron: "5 0 1 * *" deployments: - name: update-ncbi-datasets @@ -82,3 +84,80 @@ deployments: append: true dry_run: true work_pool: *goat_data_work_pool + + - name: update-tolid-prefixes + # This flow updates the TOLID prefixes file + entrypoint: flows/updaters/update_tolid_prefixes.py:update_tolid_prefixes + parameters: + # Local directory path to save the TOLID prefixes TSV file + output_path: "/home/ubuntu/tmp/test/tolid-prefixes" + schedules: + - *weekly + work_pool: *goat_data_work_pool + + - name: update-ott-taxonomy + # This flow updates the OTT taxonomy file + entrypoint: flows/updaters/update_ott_taxonomy.py:update_ott_taxonomy + parameters: + # Local directory path to save the OTT taxonomy file + output_path: "/home/ubuntu/tmp/test/ott-taxonomy" + schedules: + - *monthly + work_pool: *goat_data_work_pool + + - name: update-ncbi-taxonomy + # This flow updates the NCBI taxonomy dump + 
entrypoint: flows/updaters/update_ncbi_taxonomy.py:update_ncbi_taxonomy + parameters: + # Local path to save the NCBI taxonomy dump files + output_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" + schedules: + - *daily + work_pool: *goat_data_work_pool + + - name: update-ena-taxonomy-extra + # This flow updates the ENA taxonomy dump + entrypoint: flows/updaters/update_ena_taxonomy_extra.py:update_ena_taxonomy_extra + parameters: + # Local path to the NCBI taxonomy dump files + taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" + # Local path to save the ENA taxonomy JSONL files + output_path: "/home/ubuntu/tmp/test/ena-taxonomy" + # A flag to indicate if the flow should append to the existing JSONL file + append: true + triggers: + - enabled: true + match: + prefect.resource.type: ncbi.taxonomy + expect: + - update.ncbi.taxonomy.finished + parameters: + taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" + output_path: "/home/ubuntu/tmp/test/ena-taxonomy" + s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" + append: true + work_pool: *goat_data_work_pool + + - name: update-genomehubs-taxonomy + # This flow updates the GenomeHubs taxonomy file + entrypoint: flows/updaters/update_genomehubs_taxonomy.py:update_genomehubs_taxonomy + parameters: + # The root taxon Id to use. 2759 is the taxon Id for Eukaryota + root_taxid: "2759" + # Local path to the input configuration YAML file + input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota_taxonomy_input.yaml" + # Local path to save the GenomeHubs taxonomy JSONL file + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" + # The S3 path to save the GenomeHubs taxonomy JSONL file + s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" + triggers: + - enabled: true + match: + prefect.resource.type: ena.taxonomy + expect: + - update.ena.taxonomy.finished + parameters: + root_taxid: "2759" + input_path: "" + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 853f91c..de1fb20 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -8,8 +8,6 @@ from tqdm import tqdm -from flows.lib.utils import fetch_from_s3, upload_to_s3 - if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) __package__ = "flows" @@ -25,9 +23,10 @@ parse_args, required, ) +from flows.lib.utils import fetch_from_s3, upload_to_s3 -@task() +@task(log_prints=True) def read_ncbi_tax_ids(taxdump_path: str) -> set[str]: """Read NCBI tax IDs from the taxdump nodes file.""" print(f"Reading NCBI taxids from {taxdump_path}") @@ -41,7 +40,7 @@ def read_ncbi_tax_ids(taxdump_path: str) -> set[str]: return tax_ids -@task() +@task(log_prints=True) def add_jsonl_tax_ids(jsonl_path: str, tax_ids: set[str]) -> None: print(f"Reading previously fetched ENA taxids from {jsonl_path}") filtered_path = f"{jsonl_path}.filtered" @@ -84,6 +83,7 @@ def get_ena_api_new_taxids(root_taxid: str, existing_tax_ids: set[str]) -> set[s return new_tax_ids +@task(log_prints=True) def fetch_ena_jsonl(tax_id, f_out): print("Fetching new tax_ids from ENA API") url = "https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/" @@ -112,11 +112,13 @@ def update_ena_jsonl(new_tax_ids: set[str], 
output_path: str, append: bool) -> N print(f"Error updating {output_path}: {e}") +@task(log_prints=True) def fetch_s3_jsonl(s3_path: str, local_path: str) -> None: print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") fetch_from_s3(s3_path, local_path) +@task(log_prints=True) def upload_s3_jsonl(local_path: str, s3_path: str) -> None: print(f"Uploading updated ENA JSONL file from {local_path} to {s3_path}") upload_to_s3(local_path, s3_path) @@ -132,6 +134,7 @@ def update_ena_taxonomy_extra( root_taxid (str): Root taxon ID to filter by. taxdump_path (str): Path to the NCBI taxdump files. output_path (str): Path to save the taxonomy dump. + s3_path (str): S3 path to upload the taxonomy dump. append (bool): Flag to append entries to an existing JSONL file. """ diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py new file mode 100644 index 0000000..ce4a112 --- /dev/null +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +from collections import defaultdict +from os.path import abspath, dirname + +import yaml + +if __name__ == "__main__" and __package__ is None: + sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + __package__ = "flows" + +from flows.lib.conditional_import import emit_event, flow, task +from flows.lib.shared_args import ( + INPUT_PATH, + OUTPUT_PATH, + ROOT_TAXID, + S3_PATH, + default, + parse_args, + required, +) +from flows.lib.utils import fetch_from_s3, upload_to_s3 + + +def get_file_paths_from_config(config: dict, file_paths: dict) -> dict: + key = config.get("xref_label") + input_path = config.get("path") + output_path = config.get("out") + if key is not None and input_path is not None: + file_paths[key] = { + "input": input_path, + } + return output_path + + +@task(log_prints=True) +def read_input_config(input_path: str) -> dict: + print(f"Reading input config from {input_path}") + file_paths = defaultdict(dict) + try: + with open(input_path, "r") as f: + config = yaml.safe_load(f) + except Exception as e: + print(f"Error reading {input_path}: {e}") + exit() + try: + output_path = get_file_paths_from_config(config, file_paths) + if output_path is not None: + file_paths["out"] = output_path + for taxonomy in config.get("taxonomies", []): + get_file_paths_from_config(taxonomy, file_paths) + except Exception as e: + print(f"Error parsing {input_path}: {e}") + exit() + return file_paths + + +@task(log_prints=True) +def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> None: + print(f"Running blobtk taxonomy with root taxid {root_taxid}") + cmd = [ + "blobtk", + "taxonomy", + "-c", + input_path, + "-o", + output_path, + "-r", + root_taxid, + ] + print(f"Running command: {' '.join(cmd)}") + try: + subprocess.run(cmd, check=True) + except Exception as e: + print(f"Error running blobtk taxonomy: {e}") + exit() + + +@task(log_prints=True) +def fetch_s3_file(s3_path: str, local_path: str) -> None: + print(f"Fetching file from {s3_path} to {local_path}") + fetch_from_s3(s3_path, local_path) + + +@task(log_prints=True) +def upload_s3_file(local_path: str, s3_path: str) -> None: + print(f"Uploading file from {local_path} to {s3_path}") + upload_to_s3(local_path, s3_path) + + +@flow() +def update_genomehubs_taxonomy( + root_taxid: str, input_path: str, output_path: str, s3_path: str +) -> None: + """Update the GenomeHubs taxonomy JSONL file. + + Args: + root_taxid (str): Root taxon ID to filter by. 
+ input_path (str): Path to the input config file. + output_path (str): Path to save the taxonomy dump. + s3_path (str): S3 path to upload the taxonomy dump. + """ + + # 1. parse input config yaml + file_paths = read_input_config(input_path) + + # 2. check files exist locally + for key, paths in file_paths.items(): + if "input" in paths: + try: + with open(paths["input"], "r"): + pass + except FileNotFoundError: + print(f"Error: {paths['input']} not found") + exit() + except Exception as e: + print(f"Error reading {paths['input']}: {e}") + exit() + # 3. run blobtk to collate and filter taxonomies + run_blobtk_taxonomy(root_taxid, input_path, output_path) + + # 4. upload updated JSONL file to s3 if s3_path is provided + if s3_path: + upload_s3_file(f"{output_path}/nodes.jsonl", s3_path) + + # 5. count lines in output file + line_count = 0 + try: + with open(f"{output_path}/nodes.jsonl", "r") as f: + line_count = sum(1 for _ in f) + print(f"Output file has {line_count} lines") + except Exception as e: + print(f"Error reading {output_path}/nodes.jsonl: {e}") + exit() + + emit_event( + event="update.genomehubs.taxonomy.finished", + resource={ + "prefect.resource.id": f"fetch.taxonomy.{output_path}", + "prefect.resource.type": "genomehubs.taxonomy", + # "prefect.resource.matches.previous": ("yes" if status else "no"), + }, + payload={"line_count": line_count}, + ) + + +if __name__ == "__main__": + """Run the flow.""" + args = parse_args( + [ + default(ROOT_TAXID, "2759"), + required(INPUT_PATH), + required(OUTPUT_PATH), + S3_PATH, + ], + "Collate source taxonomies and names into GenomeHubs JSONL taxonomy format.", + ) + + update_genomehubs_taxonomy(**vars(args)) From ba03bafae2f00e953595b637c34f4e52aa6bbb93 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 14:00:32 +0100 Subject: [PATCH 13/44] comment out atypical assembly filter --- flows/parsers/parse_ncbi_assemblies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index f2fad6c..1841ed3 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -120,8 +120,9 @@ def process_assembly_report( Returns: dict: The updated report dictionary. 
""" - if is_atypical_assembly(report, parsed): - return None + # Uncomment to filter atypical assemblies + # if is_atypical_assembly(report, parsed): + # return None processed_report = {**report, "processedAssemblyInfo": {"organelle": "nucleus"}} if "pairedAccession" in report: if processed_report["pairedAccession"].startswith("GCF_"): From 68cf6be23772b80c4e4d97aceaa2687db93c83bf Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 14:26:16 +0100 Subject: [PATCH 14/44] run flow on feature branch for testing --- flows/prefect.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 7aec390..d96ecee 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -17,7 +17,7 @@ pull: # Pull the genomehubs/data repository to get the latest flows id: clone-data repository: https://github.com/genomehubs/data.git - branch: main + branch: feature/pipeline-updates definitions: work_pools: @@ -121,8 +121,8 @@ deployments: parameters: # Local path to the NCBI taxonomy dump files taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" - # Local path to save the ENA taxonomy JSONL files - output_path: "/home/ubuntu/tmp/test/ena-taxonomy" + # Local path to save the ENA taxonomy JSONL file + output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" # A flag to indicate if the flow should append to the existing JSONL file append: true triggers: @@ -145,11 +145,11 @@ deployments: # The root taxon Id to use. 2759 is the taxon Id for Eukaryota root_taxid: "2759" # Local path to the input configuration YAML file - input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota_taxonomy_input.yaml" + input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota-taxonomy-input.yaml" # Local path to save the GenomeHubs taxonomy JSONL file - output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota/nodes.jsonl" # The S3 path to save the GenomeHubs taxonomy JSONL file - s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota-nodes.jsonl" triggers: - enabled: true match: @@ -158,6 +158,6 @@ deployments: - update.ena.taxonomy.finished parameters: root_taxid: "2759" - input_path: "" + input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota_taxonomy_input.yaml" output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" From b77d9ff9f8c156dcd0b0594946ca021d900ff5ce Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 14:37:15 +0100 Subject: [PATCH 15/44] remove unused variables --- flows/prefect.yaml | 3 +++ flows/updaters/update_ncbi_taxonomy.py | 27 +++++++------------------ flows/updaters/update_ott_taxonomy.py | 25 ++++++----------------- flows/updaters/update_tolid_prefixes.py | 23 ++++++--------------- 4 files changed, 22 insertions(+), 56 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index d96ecee..0f4a449 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -123,6 +123,8 @@ deployments: taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" # Local path to save the ENA taxonomy JSONL file output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" + # The S3 path to save the ENA taxonomy JSONL file + s3_path: 
"s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" # A flag to indicate if the flow should append to the existing JSONL file append: true triggers: @@ -161,3 +163,4 @@ deployments: input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota_taxonomy_input.yaml" output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" + work_pool: *goat_data_work_pool diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py index 1e37a37..3b3d8e7 100644 --- a/flows/updaters/update_ncbi_taxonomy.py +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -10,28 +10,19 @@ __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task -from flows.lib.shared_args import ( - OUTPUT_PATH, - ROOT_TAXID, - S3_PATH, - default, - parse_args, - required, -) +from flows.lib.shared_args import OUTPUT_PATH, parse_args, required from flows.lib.utils import generate_md5, is_local_file_current_http @task(retries=2, retry_delay_seconds=2, log_prints=True) def fetch_ncbi_taxonomy( - root_taxid: str, local_path: str, http_path: str = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", ) -> bool: """ - Fetch the NCBI taxonomy dump and filter by root taxon if specified. + Fetch the NCBI taxonomy dump. Args: - root_taxid (str): Root taxon ID to filter by. http_path (str): URL to fetch the taxonomy dump from. local_path (str): Path to save the taxonomy dump. @@ -95,13 +86,11 @@ def taxonomy_is_up_to_date(local_path: str, http_path: str) -> bool: @flow() -def update_ncbi_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None: - """Fetch and optionally update the NCBI taxonomy dump. +def update_ncbi_taxonomy(output_path: str) -> None: + """Fetch and the NCBI taxonomy dump. Args: - root_taxid (str): Root taxon ID to filter by. output_path (str): Path to save the taxonomy dump. - s3_path (str): S3 path to compare with. 
""" http_path = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz" status = None @@ -109,9 +98,7 @@ def update_ncbi_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> Non status = True else: status = False - fetch_ncbi_taxonomy( - local_path=output_path, http_path=http_path, root_taxid=root_taxid - ) + fetch_ncbi_taxonomy(local_path=output_path, http_path=http_path) print(f"Taxonomy update status: {status}") emit_event( @@ -129,8 +116,8 @@ def update_ncbi_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> Non if __name__ == "__main__": """Run the flow.""" args = parse_args( - [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], - "Fetch NCBI taxdump and optionally filter by root taxon.", + [required(OUTPUT_PATH)], + "Fetch NCBI taxdump.", ) update_ncbi_taxonomy(**vars(args)) diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py index 003b7df..08407ed 100644 --- a/flows/updaters/update_ott_taxonomy.py +++ b/flows/updaters/update_ott_taxonomy.py @@ -13,20 +13,12 @@ import json from flows.lib.conditional_import import emit_event, flow, task -from flows.lib.shared_args import ( - OUTPUT_PATH, - ROOT_TAXID, - S3_PATH, - default, - parse_args, - required, -) +from flows.lib.shared_args import OUTPUT_PATH, parse_args, required from flows.lib.utils import is_local_file_current_http @task(retries=2, retry_delay_seconds=2, log_prints=True) def fetch_ott_taxonomy( - root_taxid: str, local_path: str, http_path: str, ) -> bool: @@ -34,7 +26,6 @@ def fetch_ott_taxonomy( Fetch the OTT taxonomy and filter by root taxon if specified. Args: - root_taxid (str): Root taxon ID to filter by. http_path (str): URL to fetch the taxonomy from. local_path (str): Path to save the taxonomy. @@ -138,13 +129,11 @@ def set_ott_url() -> str: @flow() -def update_ott_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None: - """Fetch and optionally update the OTT taxonomy file. +def update_ott_taxonomy(output_path: str) -> None: + """Fetch the OTT taxonomy file. Args: - root_taxid (str): Root taxon ID to filter by. output_path (str): Path to save the taxonomy dump. - s3_path (str): S3 path to compare with. 
""" http_path = set_ott_url() status = None @@ -154,9 +143,7 @@ def update_ott_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None complete = True else: status = False - complete = fetch_ott_taxonomy( - local_path=output_path, http_path=http_path, root_taxid=root_taxid - ) + complete = fetch_ott_taxonomy(local_path=output_path, http_path=http_path) print(f"OTT taxonomy file matches previous: {status}") if complete: @@ -175,8 +162,8 @@ def update_ott_taxonomy(root_taxid: str, output_path: str, s3_path: str) -> None if __name__ == "__main__": """Run the flow.""" args = parse_args( - [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], - "Fetch OTT taxonomy file and optionally filter by root taxon.", + [required(OUTPUT_PATH)], + "Fetch OTT taxonomy file.", ) update_ott_taxonomy(**vars(args)) diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index 8a4ba0c..61386ce 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -11,20 +11,12 @@ __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task -from flows.lib.shared_args import ( - OUTPUT_PATH, - ROOT_TAXID, - S3_PATH, - default, - parse_args, - required, -) +from flows.lib.shared_args import OUTPUT_PATH, parse_args, required from flows.lib.utils import is_local_file_current_http @task(retries=2, retry_delay_seconds=2, log_prints=True) def fetch_tolid_prefixes( - root_taxid: str, local_path: str, http_path: str = ( "https://gitlab.com/wtsi-grit/darwin-tree-of-life-sample-naming/" @@ -36,7 +28,6 @@ def fetch_tolid_prefixes( Fetch the ToLID prefix file and filter by root taxon if specified. Args: - root_taxid (str): Root taxon ID to filter by. http_path (str): URL to fetch the ToLID prefix file from. local_path (str): Path to save the ToLID prefix file. @@ -77,13 +68,11 @@ def tolid_file_is_up_to_date(local_path: str, http_path: str) -> bool: @flow() -def update_tolid_prefixes(root_taxid: str, output_path: str, s3_path: str) -> None: - """Fetch and optionally update the ToLID prefixes file. +def update_tolid_prefixes(output_path: str) -> None: + """Fetch the ToLID prefixes file. Args: - root_taxid (str): Root taxon ID to filter by. output_path (str): Path to save the taxonomy dump. - s3_path (str): S3 path to compare with. 
""" http_path = ( "https://gitlab.com/wtsi-grit/darwin-tree-of-life-sample-naming/" @@ -100,7 +89,7 @@ def update_tolid_prefixes(root_taxid: str, output_path: str, s3_path: str) -> No else: status = False complete, line_count = fetch_tolid_prefixes( - local_path=output_path, http_path=http_path, root_taxid=root_taxid + local_path=output_path, http_path=http_path ) print(f"TolID file matches previous: {status}") @@ -120,8 +109,8 @@ def update_tolid_prefixes(root_taxid: str, output_path: str, s3_path: str) -> No if __name__ == "__main__": """Run the flow.""" args = parse_args( - [default(ROOT_TAXID, "taxon"), required(OUTPUT_PATH), S3_PATH], - "Fetch ToLID prefixes and optionally filter by root taxon.", + [required(OUTPUT_PATH)], + "Fetch ToLID prefixes.", ) update_tolid_prefixes(**vars(args)) From 89ac1243ffb31578f4983fb4d388413e020d82bf Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 14:42:59 +0100 Subject: [PATCH 16/44] update ena taxonomy trigger --- flows/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 0f4a449..f50e067 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -135,7 +135,7 @@ deployments: - update.ncbi.taxonomy.finished parameters: taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" - output_path: "/home/ubuntu/tmp/test/ena-taxonomy" + output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" append: true work_pool: *goat_data_work_pool From 69cc057ad02ba72a7f784ec552d0cde1535bbace Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 14:47:55 +0100 Subject: [PATCH 17/44] add root taxon id parameter --- flows/prefect.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index f50e067..29510e1 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -119,6 +119,8 @@ deployments: # This flow updates the ENA taxonomy dump entrypoint: flows/updaters/update_ena_taxonomy_extra.py:update_ena_taxonomy_extra parameters: + # Root taxon Id to use. 
2759 is the taxon Id for Eukaryota + root_taxid: "2759" # Local path to the NCBI taxonomy dump files taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" # Local path to save the ENA taxonomy JSONL file @@ -134,6 +136,7 @@ deployments: expect: - update.ncbi.taxonomy.finished parameters: + root_taxid: "2759" taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" From fc739700553280097cb5ff5ffb5c644b2860c5a9 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 15:46:31 +0100 Subject: [PATCH 18/44] gzip ena jsonl on s3 --- flows/lib/utils.py | 27 ++++++++++++++++++--- flows/updaters/update_ena_taxonomy_extra.py | 13 +++++----- 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 2db32cb..def2622 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -1,8 +1,10 @@ #!/usr/bin/python3 import contextlib +import gzip import hashlib import os +import shutil from argparse import Action from csv import DictReader, Sniffer from datetime import datetime @@ -600,13 +602,14 @@ def find_s3_file(s3_path: list, filename: str) -> str: return None -def fetch_from_s3(s3_path: str, local_path: str) -> None: +def fetch_from_s3(s3_path: str, local_path: str, gz: bool = False) -> None: """ Fetch a file from S3. Args: s3_path (str): Path to the remote file on s3. local_path (str): Path to the local file. + gz (bool): Whether to gunzip the file after downloading. Defaults to False. Returns: None: This function downloads the file from S3 to the local path. @@ -622,19 +625,28 @@ def parse_s3_path(s3_path): # Download the file from S3 to the local path try: - s3.download_file(Bucket=bucket, Key=key, Filename=local_path) + if gz: + s3.download_file(Bucket=bucket, Key=key, Filename=f"{local_path}.gz") + # unzip the file + with gzip.open(f"{local_path}.gz", "rb") as f_in: + with open(local_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(f"{local_path}.gz") + else: + s3.download_file(Bucket=bucket, Key=key, Filename=local_path) except ClientError as e: print(f"Error downloading {s3_path} to {local_path}: {e}") raise e -def upload_to_s3(local_path: str, s3_path: str) -> None: +def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: """ Upload a file to S3. Args: local_path (str): Path to the local file. s3_path (str): Path to the remote file on s3. + gz (bool): Whether to gzip the file before uploading. Defaults to False. Returns: None: This function uploads the local file to S3. 
@@ -650,7 +662,14 @@ def parse_s3_path(s3_path): # Upload the file to S3 try: - s3.upload_file(Filename=local_path, Bucket=bucket, Key=key) + if gz: + with open(local_path, "rb") as f_in: + with gzip.open(f"{local_path}.gz", "wb") as f_out: + f_out.write(f_in.read()) + s3.upload_file(Filename=f"{local_path}.gz", Bucket=bucket, Key=key) + os.remove(f"{local_path}.gz") + else: + s3.upload_file(Filename=local_path, Bucket=bucket, Key=key) except ClientError as e: print(f"Error uploading {local_path} to {s3_path}: {e}") raise e diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index de1fb20..1c5db72 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -113,15 +113,15 @@ def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> N @task(log_prints=True) -def fetch_s3_jsonl(s3_path: str, local_path: str) -> None: +def fetch_s3_jsonl(s3_path: str, local_path: str, gz: bool = False) -> None: print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") - fetch_from_s3(s3_path, local_path) + fetch_from_s3(s3_path, local_path, gz=gz) @task(log_prints=True) -def upload_s3_jsonl(local_path: str, s3_path: str) -> None: +def upload_s3_jsonl(local_path: str, s3_path: str, gz: bool = False) -> None: print(f"Uploading updated ENA JSONL file from {local_path} to {s3_path}") - upload_to_s3(local_path, s3_path) + upload_to_s3(local_path, s3_path, gz=gz) @flow() @@ -143,7 +143,7 @@ def update_ena_taxonomy_extra( if append: # 2. fetch jsonl file from s3 if s3_path is provided if s3_path: - fetch_s3_jsonl(s3_path, output_path) + fetch_s3_jsonl(s3_path, output_path, gz=True) # 3. read existing IDs from local JSONL file add_jsonl_tax_ids(output_path, existing_tax_ids) # 4. fetch list of new IDs from ENA API @@ -152,7 +152,8 @@ def update_ena_taxonomy_extra( update_ena_jsonl(new_tax_ids, output_path, append) # 6. 
upload updated JSONL file to s3 if s3_path is provided if s3_path: - upload_s3_jsonl(output_path, s3_path) + # gzip the file first + upload_s3_jsonl(output_path, s3_path, gz=True) status = len(new_tax_ids) == 0 From ce9a7339dcf9b179dbad06cf72c119c2925ca969 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Fri, 12 Sep 2025 15:54:10 +0100 Subject: [PATCH 19/44] use gzipped s3 jsonl file --- flows/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 29510e1..dbbc92b 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -139,7 +139,7 @@ deployments: root_taxid: "2759" taxdump_path: "/home/ubuntu/tmp/test/ncbi-taxonomy" output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" - s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" + s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl.gz" append: true work_pool: *goat_data_work_pool From a9067c47e429db8d56b85b9b93929380813e2c48 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:00:26 +0100 Subject: [PATCH 20/44] use s3cmd for s3 upload --- flows/lib/utils.py | 57 ++++++++++++++------- flows/updaters/update_ena_taxonomy_extra.py | 4 +- 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index def2622..bb2344e 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -4,7 +4,9 @@ import gzip import hashlib import os +import shlex import shutil +import subprocess from argparse import Action from csv import DictReader, Sniffer from datetime import datetime @@ -623,15 +625,17 @@ def parse_s3_path(s3_path): bucket, key = parse_s3_path(s3_path) + if s3_path.endswith(".gz") and not local_path.endswith(".gz"): + gz = True + # Download the file from S3 to the local path try: if gz: s3.download_file(Bucket=bucket, Key=key, Filename=f"{local_path}.gz") - # unzip the file - with gzip.open(f"{local_path}.gz", "rb") as f_in: - with open(local_path, "wb") as f_out: + with open(local_path, "rb") as f_in: + with gzip.open(f"{local_path}.gz", "wb") as f_out: shutil.copyfileobj(f_in, f_out) - os.remove(f"{local_path}.gz") + os.remove(local_path) else: s3.download_file(Bucket=bucket, Key=key, Filename=local_path) except ClientError as e: @@ -651,26 +655,41 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: Returns: None: This function uploads the local file to S3. 
""" - s3 = boto3.client("s3") - - # Extract bucket name and key from the S3 path - def parse_s3_path(s3_path): - bucket, key = s3_path.removeprefix("s3://").split("/", 1) - return bucket, key - bucket, key = parse_s3_path(s3_path) + if s3_path.endswith(".gz") and not local_path.endswith(".gz"): + gz = True - # Upload the file to S3 try: if gz: - with open(local_path, "rb") as f_in: - with gzip.open(f"{local_path}.gz", "wb") as f_out: - f_out.write(f_in.read()) - s3.upload_file(Filename=f"{local_path}.gz", Bucket=bucket, Key=key) - os.remove(f"{local_path}.gz") + gz_path = f"{local_path}.gz" + with open(local_path, "rb") as f_in, gzip.open(gz_path, "wb") as f_out: + f_out.write(f_in.read()) + cmd = [ + "s3cmd", + "put", + "setacl", + "--acl-public", + shlex.quote(gz_path), + shlex.quote(s3_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True) + os.remove(gz_path) else: - s3.upload_file(Filename=local_path, Bucket=bucket, Key=key) - except ClientError as e: + cmd = [ + "s3cmd", + "put", + "setacl", + "--acl-public", + shlex.quote(local_path), + shlex.quote(s3_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + print( + f"Error uploading {local_path} to {s3_path} with s3cmd: {result.stderr}" + ) + raise RuntimeError(f"s3cmd upload failed: {result.stderr}") + except Exception as e: print(f"Error uploading {local_path} to {s3_path}: {e}") raise e diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 1c5db72..9d3580e 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -115,13 +115,13 @@ def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> N @task(log_prints=True) def fetch_s3_jsonl(s3_path: str, local_path: str, gz: bool = False) -> None: print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") - fetch_from_s3(s3_path, local_path, gz=gz) + fetch_from_s3(s3_path, local_path) @task(log_prints=True) def upload_s3_jsonl(local_path: str, s3_path: str, gz: bool = False) -> None: print(f"Uploading updated ENA JSONL file from {local_path} to {s3_path}") - upload_to_s3(local_path, s3_path, gz=gz) + upload_to_s3(local_path, s3_path) @flow() From b83429da05b5f75b028855b9dc24e5077d5bb7a8 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:08:59 +0100 Subject: [PATCH 21/44] fix unzip behaviour --- flows/lib/utils.py | 11 ++++++----- flows/updaters/update_ena_taxonomy_extra.py | 9 ++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index bb2344e..0cf6516 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -631,11 +631,12 @@ def parse_s3_path(s3_path): # Download the file from S3 to the local path try: if gz: - s3.download_file(Bucket=bucket, Key=key, Filename=f"{local_path}.gz") - with open(local_path, "rb") as f_in: - with gzip.open(f"{local_path}.gz", "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - os.remove(local_path) + gz_path = f"{local_path}.gz" + s3.download_file(Bucket=bucket, Key=key, Filename=gz_path) + # Unzip gz_path to local_path, then remove gz_path + with gzip.open(gz_path, "rb") as f_in, open(local_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(gz_path) else: s3.download_file(Bucket=bucket, Key=key, Filename=local_path) except ClientError as e: diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 
9d3580e..de1fb20 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -113,13 +113,13 @@ def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> N @task(log_prints=True) -def fetch_s3_jsonl(s3_path: str, local_path: str, gz: bool = False) -> None: +def fetch_s3_jsonl(s3_path: str, local_path: str) -> None: print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") fetch_from_s3(s3_path, local_path) @task(log_prints=True) -def upload_s3_jsonl(local_path: str, s3_path: str, gz: bool = False) -> None: +def upload_s3_jsonl(local_path: str, s3_path: str) -> None: print(f"Uploading updated ENA JSONL file from {local_path} to {s3_path}") upload_to_s3(local_path, s3_path) @@ -143,7 +143,7 @@ def update_ena_taxonomy_extra( if append: # 2. fetch jsonl file from s3 if s3_path is provided if s3_path: - fetch_s3_jsonl(s3_path, output_path, gz=True) + fetch_s3_jsonl(s3_path, output_path) # 3. read existing IDs from local JSONL file add_jsonl_tax_ids(output_path, existing_tax_ids) # 4. fetch list of new IDs from ENA API @@ -152,8 +152,7 @@ def update_ena_taxonomy_extra( update_ena_jsonl(new_tax_ids, output_path, append) # 6. upload updated JSONL file to s3 if s3_path is provided if s3_path: - # gzip the file first - upload_s3_jsonl(output_path, s3_path, gz=True) + upload_s3_jsonl(output_path, s3_path) status = len(new_tax_ids) == 0 From 0e0555066f5a9fc8540ae226d1116de489eede30 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:19:35 +0100 Subject: [PATCH 22/44] include directories in file exists check --- flows/updaters/update_genomehubs_taxonomy.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index ce4a112..964e40e 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import os import subprocess import sys from collections import defaultdict @@ -106,17 +107,16 @@ def update_genomehubs_taxonomy( # 1. parse input config yaml file_paths = read_input_config(input_path) - # 2. check files exist locally + # 2. check files exist locally (file or directory) + for key, paths in file_paths.items(): if "input" in paths: - try: - with open(paths["input"], "r"): - pass - except FileNotFoundError: - print(f"Error: {paths['input']} not found") + input_path = paths["input"] + if not os.path.exists(input_path): + print(f"Error: {input_path} not found") exit() - except Exception as e: - print(f"Error reading {paths['input']}: {e}") + if not (os.path.isfile(input_path) or os.path.isdir(input_path)): + print(f"Error: {input_path} is not a file or directory") exit() # 3. 
run blobtk to collate and filter taxonomies run_blobtk_taxonomy(root_taxid, input_path, output_path) From c8d031e7a1badac2cab1f09ba540a52b0e77b3a4 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:23:06 +0100 Subject: [PATCH 23/44] update filenames in prefect.yaml --- flows/prefect.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index dbbc92b..94fac7b 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -163,7 +163,7 @@ deployments: - update.ena.taxonomy.finished parameters: root_taxid: "2759" - input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota_taxonomy_input.yaml" - output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs_taxonomy_eukaryota.jsonl" - s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs_taxonomy_eukaryota.jsonl" + input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota-taxonomy-input.yaml" + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs-taxonomy-eukaryota.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy-eukaryota.jsonl" work_pool: *goat_data_work_pool From 442a8c8b19158f7218d5fe215ebdc874f046f0d1 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:28:59 +0100 Subject: [PATCH 24/44] fix variable name conflict --- flows/updaters/update_genomehubs_taxonomy.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index 964e40e..87d6b10 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -111,12 +111,12 @@ def update_genomehubs_taxonomy( for key, paths in file_paths.items(): if "input" in paths: - input_path = paths["input"] - if not os.path.exists(input_path): - print(f"Error: {input_path} not found") + taxonomy_path = paths["input"] + if not os.path.exists(taxonomy_path): + print(f"Error: {taxonomy_path} not found") exit() - if not (os.path.isfile(input_path) or os.path.isdir(input_path)): - print(f"Error: {input_path} is not a file or directory") + if not (os.path.isfile(taxonomy_path) or os.path.isdir(taxonomy_path)): + print(f"Error: {taxonomy_path} is not a file or directory") exit() # 3. 
run blobtk to collate and filter taxonomies run_blobtk_taxonomy(root_taxid, input_path, output_path) From d0e7b0292572b1af58f741c7d60e14c583d54e8d Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 09:32:22 +0100 Subject: [PATCH 25/44] fix case for output flag --- flows/updaters/update_genomehubs_taxonomy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index 87d6b10..4fb714b 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -66,7 +66,7 @@ def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> N "taxonomy", "-c", input_path, - "-o", + "-O", output_path, "-r", root_taxid, From c46de06470223d94309fdb56d5c73f35d4f164ac Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 10:00:07 +0100 Subject: [PATCH 26/44] write blobtk output to prefect logs --- flows/updaters/update_genomehubs_taxonomy.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index 4fb714b..2a27b0f 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -73,7 +73,19 @@ def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> N ] print(f"Running command: {' '.join(cmd)}") try: - subprocess.run(cmd, check=True) + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + for line in process.stdout: + print(line, end="") + process.wait() + if process.returncode != 0: + print(f"Command failed with exit code {process.returncode}") + exit() except Exception as e: print(f"Error running blobtk taxonomy: {e}") exit() From e8fa6ca24db5f47ae297762e7a807d6e3d71ce32 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 10:53:50 +0100 Subject: [PATCH 27/44] don't append nodes.jsonl to output path --- flows/updaters/update_genomehubs_taxonomy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index 2a27b0f..fa1fa11 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -135,16 +135,16 @@ def update_genomehubs_taxonomy( # 4. upload updated JSONL file to s3 if s3_path is provided if s3_path: - upload_s3_file(f"{output_path}/nodes.jsonl", s3_path) + upload_s3_file(f"{output_path}", s3_path) # 5. 
count lines in output file line_count = 0 try: - with open(f"{output_path}/nodes.jsonl", "r") as f: + with open(f"{output_path}", "r") as f: line_count = sum(1 for _ in f) print(f"Output file has {line_count} lines") except Exception as e: - print(f"Error reading {output_path}/nodes.jsonl: {e}") + print(f"Error reading {output_path}: {e}") exit() emit_event( From 73a8361e535489dda9d31434a6aa91f2ee1e0007 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 10:55:40 +0100 Subject: [PATCH 28/44] gzip remote jsonl --- flows/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 94fac7b..4a773b3 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -154,7 +154,7 @@ deployments: # Local path to save the GenomeHubs taxonomy JSONL file output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota/nodes.jsonl" # The S3 path to save the GenomeHubs taxonomy JSONL file - s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota-nodes.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota-nodes.jsonl.gz" triggers: - enabled: true match: From 3b9ee4d467a87def3aba7d62aaff57780d4caf61 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 13:50:57 +0100 Subject: [PATCH 29/44] add step to sort ena jsonl by lineage --- flows/updaters/update_ena_taxonomy_extra.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index de1fb20..107822e 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -112,6 +112,23 @@ def update_ena_jsonl(new_tax_ids: set[str], output_path: str, append: bool) -> N print(f"Error updating {output_path}: {e}") +@task(log_prints=True) +def sort_jsonl_by_lineage(jsonl_path: str) -> None: + print(f"Sorting JSONL file by lineage at {jsonl_path}") + sorted_path = f"{jsonl_path}.sorted" + try: + with open(jsonl_path, "r") as f: + data = [json.loads(line) for line in f] + data.sort(key=lambda x: x.get("lineage", "")) + with open(sorted_path, "w") as f_out: + for entry in data: + f_out.write(json.dumps(entry) + "\n") + os.replace(sorted_path, jsonl_path) + except Exception as e: + print(f"Error sorting {jsonl_path}: {e}") + exit() + + @task(log_prints=True) def fetch_s3_jsonl(s3_path: str, local_path: str) -> None: print(f"Fetching existing ENA JSONL file from {s3_path} to {local_path}") @@ -150,7 +167,9 @@ def update_ena_taxonomy_extra( new_tax_ids = get_ena_api_new_taxids(root_taxid, existing_tax_ids) # 5. fetch details for new IDs from ENA API and save to JSONL file update_ena_jsonl(new_tax_ids, output_path, append) - # 6. upload updated JSONL file to s3 if s3_path is provided + # 6. sort the JSONL file by lineage + sort_jsonl_by_lineage(output_path) + # 7. 
upload updated JSONL file to s3 if s3_path is provided if s3_path: upload_s3_jsonl(output_path, s3_path) From e21380f93e4f6fb72ae75a2c6eddf7272074d907 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Mon, 15 Sep 2025 13:58:37 +0100 Subject: [PATCH 30/44] gzip jsonl files --- flows/prefect.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 4a773b3..d12d323 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -126,7 +126,7 @@ deployments: # Local path to save the ENA taxonomy JSONL file output_path: "/home/ubuntu/tmp/test/ena-taxonomy/ena-taxonomy-extra.jsonl" # The S3 path to save the ENA taxonomy JSONL file - s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl" + s3_path: "s3://goat/resources/taxonomy/ena/ena-taxonomy-extra.jsonl.gz" # A flag to indicate if the flow should append to the existing JSONL file append: true triggers: @@ -165,5 +165,5 @@ deployments: root_taxid: "2759" input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota-taxonomy-input.yaml" output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs-taxonomy-eukaryota.jsonl" - s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy-eukaryota.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy-eukaryota.jsonl.gz" work_pool: *goat_data_work_pool From d40769134f4ecf3f827cb3c63c50fd3862d724f4 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 16 Sep 2025 08:37:53 +0100 Subject: [PATCH 31/44] update genomehubs taxonomy paths --- flows/prefect.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index d12d323..8ce5abe 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -154,7 +154,7 @@ deployments: # Local path to save the GenomeHubs taxonomy JSONL file output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota/nodes.jsonl" # The S3 path to save the GenomeHubs taxonomy JSONL file - s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota-nodes.jsonl.gz" + s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota/nodes.jsonl.gz" triggers: - enabled: true match: @@ -164,6 +164,6 @@ deployments: parameters: root_taxid: "2759" input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota-taxonomy-input.yaml" - output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs-taxonomy-eukaryota.jsonl" - s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy-eukaryota.jsonl.gz" + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs-taxonomy/eukaryota.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy/eukaryota.jsonl.gz" work_pool: *goat_data_work_pool From eed01ca0e340a9ee24904345e0e3452a62a2f72c Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 16 Sep 2025 13:28:34 +0100 Subject: [PATCH 32/44] set config to use main branch --- flows/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 8ce5abe..138e74c 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -17,7 +17,7 @@ pull: # Pull the genomehubs/data repository to get the latest flows id: clone-data repository: https://github.com/genomehubs/data.git - branch: feature/pipeline-updates + branch: main definitions: work_pools: From fec03613897d16f5033c39a8b16b08920780d3df Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Tue, 16 Sep 2025 14:15:44 +0100 Subject: [PATCH 33/44] address sourcery issues 
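In brief, this patch adds explicit timeouts to `requests` calls and validates user-supplied paths before they are interpolated into `subprocess` commands. The sketch below is a condensed, illustrative version of that validation pattern (an `is_safe_path` guard in front of a `blobtk taxonomy` call, as used by `run_blobtk_taxonomy` earlier in this series); it summarises the diff that follows rather than adding new repository code.

```python
import re
import subprocess


def is_safe_path(path: str) -> bool:
    # Allow only word characters, dash, dot, slash and colon (colon for s3:// URIs).
    return bool(re.match(r"^[\w\-/.:]+$", path))


def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> None:
    # Validate every value that will appear in the argument list before calling out.
    for value in (root_taxid, input_path, output_path):
        if not is_safe_path(value):
            raise ValueError(f"Unsafe argument: {value}")
    cmd = ["blobtk", "taxonomy", "-c", input_path, "-O", output_path, "-r", root_taxid]
    subprocess.run(cmd, check=True)
```

The commands are built as argument lists and run without a shell, so the check is defence in depth rather than strictly required; a later patch in this series tightens `is_safe_path` further to also reject `..`, a leading `/` and `~`.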
--- flows/lib/fetch_genomehubs_target_list.py | 4 +- flows/lib/index_assembly_features.py | 14 +-- flows/lib/utils.py | 95 ++++++++++++-------- flows/lib/validate_file_pair.py | 5 ++ flows/parsers/parse_ncbi_assemblies.py | 2 + flows/updaters/api/api_config.py | 12 +-- flows/updaters/update_boat_config.py | 22 ++++- flows/updaters/update_ena_taxonomy_extra.py | 1 + flows/updaters/update_genomehubs_taxonomy.py | 9 +- flows/updaters/update_ncbi_datasets.py | 3 + flows/updaters/update_ncbi_taxonomy.py | 8 +- flows/updaters/update_nhm.py | 2 + flows/updaters/update_ott_taxonomy.py | 8 +- flows/updaters/update_tolid_prefixes.py | 10 ++- scripts/bulk_goat_lookup.py | 2 +- 15 files changed, 135 insertions(+), 62 deletions(-) diff --git a/flows/lib/fetch_genomehubs_target_list.py b/flows/lib/fetch_genomehubs_target_list.py index f0872c9..70fb62a 100644 --- a/flows/lib/fetch_genomehubs_target_list.py +++ b/flows/lib/fetch_genomehubs_target_list.py @@ -52,7 +52,9 @@ def fetch_genomehubs_list_file( print(f"Fetching records from {url}") # Fetch the list of target records - response = requests.get(url, headers={"Accept": "text/tab-separated-values"}) + response = requests.get( + url, headers={"Accept": "text/tab-separated-values"}, timeout=300 + ) response.raise_for_status() records = response.text # write records to file diff --git a/flows/lib/index_assembly_features.py b/flows/lib/index_assembly_features.py index 886fc08..65c59e0 100644 --- a/flows/lib/index_assembly_features.py +++ b/flows/lib/index_assembly_features.py @@ -56,7 +56,7 @@ def list_busco_lineages(assembly_id: str, work_dir: str) -> list: ) url = f"{goat_api}/search?{queryString}" # Fetch the list of BUSCO lineages - response = requests.get(url) + response = requests.get(url, timeout=300) response.raise_for_status() return [ get_genomehubs_attribute_value(result, "odb10_lineage") @@ -75,9 +75,9 @@ def find_busco_files(assembly_id, busco_lineages, work_dir, http_path): for path in busco_http_path: if busco_file := find_http_file(path, f"{lineage}/full_table.tsv"): local_file = f"{busco_work_dir}/{lineage}_full_table.tsv" - requests.get(busco_file).content + requests.get(busco_file, timeout=300).content with open(local_file, "wb") as file: - file.write(requests.get(busco_file).content) + file.write(requests.get(busco_file, timeout=300).content) busco_files.append(local_file) break return busco_files @@ -87,7 +87,7 @@ def find_busco_files(assembly_id, busco_lineages, work_dir, http_path): def find_blobtoolkit_files(assembly_id, work_dir, http_path): blobtoolkit_api_url = "https://blobtoolkit.genomehubs.org/api/v1" blobtoolkit_search_url = f"{blobtoolkit_api_url}/search/{assembly_id}" - response = requests.get(blobtoolkit_search_url) + response = requests.get(blobtoolkit_search_url, timeout=300) response.raise_for_status() results = response.json() if not results: @@ -107,7 +107,7 @@ def find_blobtoolkit_files(assembly_id, work_dir, http_path): return [] # fetch the full dataset metadata blobtoolkit_metadata_url = f"{blobtoolkit_api_url}/dataset/id/{dataset_id}" - response = requests.get(blobtoolkit_metadata_url) + response = requests.get(blobtoolkit_metadata_url, timeout=300) response.raise_for_status() metadata = response.json() print(metadata) @@ -132,8 +132,8 @@ def index_assembly_features( # if snapshot_exists(s3_path, assembly_id, "feature"): # return taxon_id # find_files(assembly_id, work_dir, s3_path) - busco_lineages = list_busco_lineages(assembly_id, work_dir) - busco_files = find_busco_files(assembly_id, busco_lineages, 
work_dir, http_path) + # busco_lineages = list_busco_lineages(assembly_id, work_dir) + # busco_files = find_busco_files(assembly_id, busco_lineages, work_dir, http_path) blobtoolkit_files = find_blobtoolkit_files(assembly_id, work_dir, http_path) print(blobtoolkit_files) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 0cf6516..8f566c6 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -4,7 +4,7 @@ import gzip import hashlib import os -import shlex +import re import shutil import subprocess from argparse import Action @@ -559,7 +559,7 @@ def find_http_file(http_path: str, filename: str) -> str: Returns: str: Path to the file. """ - response = requests.get(f"{http_path}/{filename}") + response = requests.get(f"{http_path}/{filename}", timeout=300) return f"{http_path}/{filename}" if response.status_code == 200 else None @@ -604,6 +604,17 @@ def find_s3_file(s3_path: list, filename: str) -> str: return None +def is_safe_path(path: str) -> bool: + # Only allow alphanumeric, dash, underscore, dot, slash, and colon (for s3) + return bool(re.match(r"^[\w\-/.:]+$", path)) + + +def parse_s3_path(s3_path): + # Extract bucket name and key from the S3 path + bucket, key = s3_path.removeprefix("s3://").split("/", 1) + return bucket, key + + def fetch_from_s3(s3_path: str, local_path: str, gz: bool = False) -> None: """ Fetch a file from S3. @@ -618,11 +629,6 @@ def fetch_from_s3(s3_path: str, local_path: str, gz: bool = False) -> None: """ s3 = boto3.client("s3") - # Extract bucket name and key from the S3 path - def parse_s3_path(s3_path): - bucket, key = s3_path.removeprefix("s3://").split("/", 1) - return bucket, key - bucket, key = parse_s3_path(s3_path) if s3_path.endswith(".gz") and not local_path.endswith(".gz"): @@ -657,6 +663,11 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: None: This function uploads the local file to S3. """ + if not is_safe_path(local_path): + raise ValueError(f"Unsafe local path: {local_path}") + if not is_safe_path(s3_path): + raise ValueError(f"Unsafe s3 path: {s3_path}") + if s3_path.endswith(".gz") and not local_path.endswith(".gz"): gz = True @@ -665,24 +676,26 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: gz_path = f"{local_path}.gz" with open(local_path, "rb") as f_in, gzip.open(gz_path, "wb") as f_out: f_out.write(f_in.read()) + # use s3cmd for uploads due to issues with boto3 and large files cmd = [ "s3cmd", "put", "setacl", "--acl-public", - shlex.quote(gz_path), - shlex.quote(s3_path), + gz_path, + s3_path, ] result = subprocess.run(cmd, capture_output=True, text=True) os.remove(gz_path) else: + # use s3cmd for uploads due to issues with boto3 and large files cmd = [ "s3cmd", "put", "setacl", "--acl-public", - shlex.quote(local_path), - shlex.quote(s3_path), + local_path, + s3_path, ] result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: @@ -752,27 +765,35 @@ def last_modified_git_remote(http_path: str) -> Optional[str]: Returns: str: Last modified date of the file. 
""" - project_path = http_path.removeprefix("https://gitlab.com/") - project_path = project_path.removesuffix(".git") - parts = project_path.split("/") - project = "%2F".join(parts[:2]) - ref = parts[4] - file = "%2F".join(parts[5:]).split("?")[0] - project_path = project_path.replace("/", "%2F") - api_url = ( - f"https://gitlab.com/api/v4/projects/{project}/repository/commits" - f"?ref_name={ref}&path={file}&per_page=1" - ) - response = requests.get(api_url) - if response.status_code == 200: - if commits := response.json(): - if committed_date := commits[0].get("committed_date", None): - dt = parser.isoparse(committed_date) - return int(dt.timestamp()) - else: - response = requests.head(http_path, allow_redirects=True) + try: + if not http_path.startswith("https://gitlab.com/"): + print(f"Malformed GitLab URL (missing prefix): {http_path}") + return None + project_path = http_path.removeprefix("https://gitlab.com/") + project_path = project_path.removesuffix(".git") + parts = project_path.split("/") + if len(parts) < 6: + print(f"Malformed GitLab URL (not enough parts): {http_path}") + return None + project = "%2F".join(parts[:2]) + ref = parts[4] + file = "%2F".join(parts[5:]).split("?")[0] + api_url = ( + f"https://gitlab.com/api/v4/projects/{project}/repository/commits" + f"?ref_name={ref}&path={file}&per_page=1" + ) + response = requests.get(api_url, timeout=300) if response.status_code == 200: - return response.headers.get("Last-Modified", None) + commits = response.json() + if commits and commits[0].get("committed_date"): + dt = parser.isoparse(commits[0]["committed_date"]) + return int(dt.timestamp()) + else: + response = requests.head(http_path, allow_redirects=True, timeout=300) + if response.status_code == 200: + return response.headers.get("Last-Modified", None) + except Exception as e: + print(f"Error parsing GitLab URL or fetching commit info: {e}") return None @@ -788,7 +809,7 @@ def last_modified_http(http_path: str) -> Optional[str]: """ if "gitlab.com" in http_path: return last_modified_git_remote(http_path) - response = requests.head(http_path, allow_redirects=True) + response = requests.head(http_path, allow_redirects=True, timeout=300) if response.status_code == 200: if last_modified := response.headers.get("Last-Modified", None): dt = parser.parse(last_modified) @@ -809,11 +830,6 @@ def last_modified_s3(s3_path: str) -> Optional[str]: """ s3 = boto3.client("s3") - # Extract bucket name and key from the S3 path - def parse_s3_path(s3_path): - bucket, key = s3_path.removeprefix("s3://").split("/", 1) - return bucket, key - bucket, key = parse_s3_path(s3_path) # Return None if the remote file does not exist @@ -825,7 +841,7 @@ def parse_s3_path(s3_path): return None -def last_modified(local_path: str) -> Optional[str]: +def last_modified(local_path: str) -> Optional[int]: """ Get the last modified date of a local file. @@ -833,7 +849,8 @@ def last_modified(local_path: str) -> Optional[str]: local_path (str): Path to the local file. Returns: - str: Last modified date of the file. + Optional[int]: Last modified date of the file as a Unix timestamp, or None if + the file does not exist. """ if not os.path.exists(local_path): return None diff --git a/flows/lib/validate_file_pair.py b/flows/lib/validate_file_pair.py index 1ada5f3..741c59b 100644 --- a/flows/lib/validate_file_pair.py +++ b/flows/lib/validate_file_pair.py @@ -43,9 +43,14 @@ def validate_yaml_file( Returns: bool: True if the YAML file is valid, False otherwise. 
""" + if not utils.is_safe_path(yaml_path): + raise ValueError(f"Unsafe YAML path: {yaml_path}") + # Validate the YAML file using blobtk validate cmd = ["blobtk", "validate", "-g", yaml_path] if taxdump_path is not None: + if not utils.is_safe_path(taxdump_path): + raise ValueError(f"Unsafe taxdump path: {taxdump_path}") cmd.extend( [ "-t", diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index 1841ed3..f2e2864 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -53,6 +53,8 @@ def fetch_ncbi_datasets_sequences( Yields: dict: The sequence report data as a JSON object, one line at a time. """ + if not utils.is_safe_path(accession): + raise ValueError(f"Unsafe accession: {accession}") result = subprocess.run( [ "datasets", diff --git a/flows/updaters/api/api_config.py b/flows/updaters/api/api_config.py index 4409306..84be72e 100644 --- a/flows/updaters/api/api_config.py +++ b/flows/updaters/api/api_config.py @@ -24,7 +24,7 @@ def vgl_url_opener(**kwargs): "https://raw.githubusercontent.com/vgl-hub/genome-portal/" "master/_data/table_tracker.yml" ) - return requests.get(vgl_url, stream=True) + return requests.get(vgl_url, stream=True, timeout=300) def vgl_hub_count_handler(r_text): @@ -81,7 +81,7 @@ def vgl_row_handler(r_text, fieldnames, **kwargs): def nhm_url_opener(**kwargs): - return requests.post(nhm_url, headers=nhm_headers, json=nhm_post_data) + return requests.post(nhm_url, headers=nhm_headers, json=nhm_post_data, timeout=300) def nhm_api_count_handler(r_text): @@ -93,7 +93,9 @@ def nhm_row_handler(fieldnames, **kwargs): nhm_post_data_after = nhm_post_data result = [] while True: - response = requests.post(nhm_url, headers=nhm_headers, json=nhm_post_data_after) + response = requests.post( + nhm_url, headers=nhm_headers, json=nhm_post_data_after, timeout=300 + ) r = response.json() dl = r["result"]["records"] for species in dl: @@ -167,7 +169,7 @@ def nhm_row_handler(fieldnames, **kwargs): def sts_url_opener(token): return requests.get( - sts_url, headers={"Token": token, "Project": "ALL"}, verify=False + sts_url, headers={"Token": token, "Project": "ALL"}, verify=False, timeout=300 ) @@ -185,7 +187,7 @@ def sts_row_handler(result_count, fieldnames, token, **kwargs): url = f"{sts_url}?page={page}&page_size={page_size}" r = requests.get( - url, headers={"Token": token, "Project": "ALL"}, verify=False + url, headers={"Token": token, "Project": "ALL"}, verify=False, timeout=300 ).json() dl = r["data"]["list"] diff --git a/flows/updaters/update_boat_config.py b/flows/updaters/update_boat_config.py index a369013..a1555fd 100644 --- a/flows/updaters/update_boat_config.py +++ b/flows/updaters/update_boat_config.py @@ -26,10 +26,16 @@ parse_args, required, ) -from flows.lib.utils import parse_tsv +from flows.lib.utils import is_safe_path, parse_tsv def taxon_id_to_ssh_path(ssh_host, taxon_id, assembly_name): + + if not is_safe_path(ssh_host): + raise ValueError(f"Unsafe ssh host: {ssh_host}") + if not is_safe_path(taxon_id): + raise ValueError(f"Unsafe taxon_id: {taxon_id}") + command = [ "ssh", ssh_host, @@ -66,6 +72,10 @@ def taxon_id_to_ssh_path(ssh_host, taxon_id, assembly_name): def lookup_buscos(ssh_host, file_path): if "lustre" in file_path: + if not is_safe_path(ssh_host): + raise ValueError(f"Unsafe ssh host: {ssh_host}") + if not is_safe_path(file_path): + raise ValueError(f"Unsafe file path: {file_path}") command = [ "ssh", @@ -90,6 +100,12 @@ def assembly_id_to_busco_sets(alt_host, 
assembly_id): Fetch the alternative path for an assembly ID from the alt host. This function uses SSH to run a command on the alt host to get the path. """ + + if not is_safe_path(alt_host): + raise ValueError(f"Unsafe alt host: {alt_host}") + if not is_safe_path(assembly_id): + raise ValueError(f"Unsafe assembly_id: {assembly_id}") + # find file on alt_host command = [ "ssh", @@ -115,7 +131,7 @@ def assembly_id_to_busco_sets(alt_host, assembly_id): busco_url = ( f"https://busco.cog.sanger.ac.uk/{assembly_id}/{lineage}/full_table.tsv" ) - response = requests.get(busco_url) + response = requests.get(busco_url, timeout=300) if response.status_code == 200: busco_sets.append(lineage) return f"https://busco.cog.sanger.ac.uk/{assembly_id}", busco_sets @@ -186,7 +202,7 @@ def fetch_goat_results(root_taxid): # fetch query_url with accept header tsv. use python module requests headers = {"Accept": "text/tab-separated-values"} - response = requests.get(query_url, headers=headers) + response = requests.get(query_url, headers=headers, timeout=300) if response.status_code != 200: raise RuntimeError( f"Error fetching BoaT config info: {response.status_code} {response.text}" diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 107822e..061dde1 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -10,6 +10,7 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index fa1fa11..130a408 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -10,6 +10,7 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task @@ -22,7 +23,7 @@ parse_args, required, ) -from flows.lib.utils import fetch_from_s3, upload_to_s3 +from flows.lib.utils import fetch_from_s3, is_safe_path, upload_to_s3 def get_file_paths_from_config(config: dict, file_paths: dict) -> dict: @@ -61,6 +62,12 @@ def read_input_config(input_path: str) -> dict: @task(log_prints=True) def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> None: print(f"Running blobtk taxonomy with root taxid {root_taxid}") + if not is_safe_path(root_taxid): + raise ValueError(f"Unsafe root taxid: {root_taxid}") + if not is_safe_path(input_path): + raise ValueError(f"Unsafe input path: {input_path}") + if not is_safe_path(output_path): + raise ValueError(f"Unsafe output path: {output_path}") cmd = [ "blobtk", "taxonomy", diff --git a/flows/updaters/update_ncbi_datasets.py b/flows/updaters/update_ncbi_datasets.py index b864b6a..989af38 100644 --- a/flows/updaters/update_ncbi_datasets.py +++ b/flows/updaters/update_ncbi_datasets.py @@ -11,6 +11,7 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task @@ -76,6 +77,8 @@ def fetch_ncbi_datasets_summary( "42452", ] for taxid in taxids: + if not taxid.isdigit(): + raise 
ValueError(f"Invalid taxid: {taxid}") # datasets summary for the root taxID command = [ "datasets", diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py index 3b3d8e7..f10b589 100644 --- a/flows/updaters/update_ncbi_taxonomy.py +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -7,11 +7,12 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import generate_md5, is_local_file_current_http +from flows.lib.utils import generate_md5, is_local_file_current_http, is_safe_path @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -30,6 +31,10 @@ def fetch_ncbi_taxonomy( bool: True if the fetched file matches the remote version, False otherwise. """ # create local_path if it doesn't exist + if not is_safe_path(local_path): + raise ValueError(f"Unsafe local path: {local_path}") + if not is_safe_path(http_path): + raise ValueError(f"Unsafe HTTP path: {http_path}") os.makedirs(local_path, exist_ok=True) local_gz_file = f"{local_path}/taxdump.tar.gz" # Fetch the remote file @@ -121,3 +126,4 @@ def update_ncbi_taxonomy(output_path: str) -> None: ) update_ncbi_taxonomy(**vars(args)) + update_ncbi_taxonomy(**vars(args)) diff --git a/flows/updaters/update_nhm.py b/flows/updaters/update_nhm.py index 4f5a7f4..de2275e 100644 --- a/flows/updaters/update_nhm.py +++ b/flows/updaters/update_nhm.py @@ -5,6 +5,7 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task @@ -72,3 +73,4 @@ def update_nhm(output_path: str, min_records: int) -> None: ) update_nhm(**vars(args)) + update_nhm(**vars(args)) diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py index 08407ed..f5056e0 100644 --- a/flows/updaters/update_ott_taxonomy.py +++ b/flows/updaters/update_ott_taxonomy.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# sourcery skip: avoid-builtin-shadow import os import subprocess import sys @@ -8,13 +7,14 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" import json from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import is_local_file_current_http +from flows.lib.utils import is_local_file_current_http, is_safe_path @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -32,6 +32,10 @@ def fetch_ott_taxonomy( Returns: bool: True if the fetched file matches the remote version, False otherwise. 
""" + if not is_safe_path(local_path): + raise ValueError(f"Unsafe local path: {local_path}") + if not is_safe_path(http_path): + raise ValueError(f"Unsafe HTTP path: {http_path}") # create local_path if it doesn't exist os.makedirs(local_path, exist_ok=True) local_gz_file = f"{local_path}/ott.tar.gz" diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index 61386ce..51ad9ab 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -# sourcery skip: avoid-builtin-shadow import os import subprocess import sys @@ -8,11 +7,12 @@ if __name__ == "__main__" and __package__ is None: sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) + # sourcery skip: avoid-builtin-shadow __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import is_local_file_current_http +from flows.lib.utils import is_local_file_current_http, is_safe_path @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -34,6 +34,12 @@ def fetch_tolid_prefixes( Returns: bool: True if the fetched file matches the remote version, False otherwise. """ + + if not is_safe_path(local_path): + raise ValueError(f"Unsafe local path: {local_path}") + if not is_safe_path(http_path): + raise ValueError(f"Unsafe HTTP path: {http_path}") + # create local_path if it doesn't exist os.makedirs(local_path, exist_ok=True) local_file = f"{local_path}/tolids.txt" diff --git a/scripts/bulk_goat_lookup.py b/scripts/bulk_goat_lookup.py index 1d7a109..f763cfc 100755 --- a/scripts/bulk_goat_lookup.py +++ b/scripts/bulk_goat_lookup.py @@ -91,7 +91,7 @@ def main(): f"&ranks={args.ranks.replace(',', '%2C')}" ) response = requests.get( - api_url, headers={"Accept": "text/tab-separated-values"} + api_url, headers={"Accept": "text/tab-separated-values"}, timeout=300 ) if response.status_code == 200: parse_results( From e68d6d7c9cf06f89b1427957f22efb6c3e29a52f Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 09:09:39 +0100 Subject: [PATCH 34/44] use module syntax to allow relative imports --- flows/README.md | 11 ++++----- flows/feature_parsers/__init__.py | 0 flows/feature_parsers/args.py | 9 ------- .../parse_blobtoolkit_assembly.py | 13 +--------- flows/feature_parsers/parse_busco_features.py | 12 ---------- flows/feature_parsers/register.py | 2 -- flows/lib/__init__.py | 0 flows/lib/conditional_import.py | 2 -- flows/lib/fetch_genomehubs_target_list.py | 2 -- flows/lib/fetch_previous_file_pair.py | 2 -- flows/lib/for_each_record.py | 2 -- flows/lib/index_assembly_features.py | 2 -- flows/lib/process_features.py | 6 ----- flows/lib/shared_args.py | 1 - flows/lib/shared_tasks.py | 2 -- flows/lib/utils.py | 24 +++++++++++-------- flows/lib/validate_file_pair.py | 2 -- flows/lib/wrapper_fetch_parse_validate.py | 9 ------- flows/parsers/__init__.py | 0 flows/parsers/args.py | 9 ------- flows/parsers/parse_ncbi_assemblies.py | 9 ------- flows/parsers/parse_refseq_organelles.py | 10 -------- flows/parsers/parse_sequencing_status.py | 10 -------- flows/parsers/parse_skip_parsing.py | 9 ------- flows/parsers/register.py | 2 -- flows/updaters/__init__.py | 0 flows/updaters/api/__init__.py | 0 flows/updaters/update_boat_config.py | 10 -------- flows/updaters/update_ena_taxonomy_extra.py | 9 ------- flows/updaters/update_genomehubs_taxonomy.py | 9 ------- flows/updaters/update_ncbi_datasets.py | 
9 ------- flows/updaters/update_ncbi_taxonomy.py | 9 ------- flows/updaters/update_nhm.py | 10 -------- flows/updaters/update_ott_taxonomy.py | 12 +--------- flows/updaters/update_tolid_prefixes.py | 9 ------- pyproject.toml | 14 ++++++++++- 36 files changed, 34 insertions(+), 207 deletions(-) create mode 100644 flows/feature_parsers/__init__.py create mode 100644 flows/lib/__init__.py create mode 100644 flows/parsers/__init__.py create mode 100644 flows/updaters/__init__.py create mode 100644 flows/updaters/api/__init__.py diff --git a/flows/README.md b/flows/README.md index e8a4baf..de7354c 100644 --- a/flows/README.md +++ b/flows/README.md @@ -2,7 +2,6 @@ A collection of prefect flows for processing and importing data into a GenomeHubs index. - # Initial setup ## Install prefect @@ -85,7 +84,7 @@ prefect --no-prompt deploy --prefect-file flows/prefect.yaml --all # Local development -All example commands below assume dependencies have been installed as described in [install dependencies](#install-dependencies) above. For local development, the [install prefect](#install-prefect) and [deploy flows](#deploy-flows) steps are not needed. All flows can be run with a `SKIP_PREFECT` environment variable to run without a Prefect API connection. +All example commands below assume dependencies have been installed as described in [install dependencies](#install-dependencies) above. For local development, the [install prefect](#install-prefect) and [deploy flows](#deploy-flows) steps are not needed. All flows can be run with a `SKIP_PREFECT` environment variable to run without a Prefect API connection. When writing new flows and tasks, please follow the established conventions, referring to existing files as examples. The import sections need to handle running with and without prefect so will typically make use of `flows/lib/conditional_import.py` and command line arguments should be standardised across all flows by importing argument definitions from `flows/lib/shared_args.py`. @@ -100,7 +99,7 @@ Updaters are flows used to update the local copy of data from a remote resource, The `update_ncbi_datasets.py` updater runs the [ncbi datasets] tool for a given root taxon ID to return a JSONL file with one line per assembly. It will optionally compare the number of lines in the fetched file to a previous version (stored in an s3 bucket) to determine whether there are additional records available. The flow emits an event on completion that can be used to trigger related flows. ``` -SKIP_PREFECT=true python3 flows/updaters/update_ncbi_datasets.py -r 9608 -o /tmp/assembly-data/ncbi_datasets_canidae.jsonl -s s3://goat/resources/assembly-data/ncbi_datasets_canidae.jsonl +SKIP_PREFECT=true python3 -m flows.updaters.update_ncbi_datasets -r 9608 -o /tmp/assembly-data/ncbi_datasets_canidae.jsonl -s s3://goat/resources/assembly-data/ncbi_datasets_canidae.jsonl ``` ## Fetch parse validate @@ -116,7 +115,7 @@ The flow at `flows/lib/fetch_previous_file_pair.py` is used to fetch a YAML/TSV This example command assumes the [genomehubs/goat-data](https://github.com/genomehubs/goat-data) repository is available in a sibling directory. 
``` -SKIP_PREFECT=true python3 flows/lib/fetch_previous_file_pair.py -y ../goat-data/sources/assembly-data/ncbi_datasets_eukaryota.types.yaml -s s3://goat/sources/assembly-data -w /tmp/assembly-data +SKIP_PREFECT=true python3 -m flows.lib.fetch_previous_file_pair -y ../goat-data/sources/assembly-data/ncbi_datasets_eukaryota.types.yaml -s s3://goat/sources/assembly-data -w /tmp/assembly-data ``` ### `flows/parsers` @@ -130,7 +129,7 @@ The `parse_ncbi_assemblies.py` parser takes an NCBI datasets JSONL file as input This example command assumes the [genomehubs/goat-data](https://github.com/genomehubs/goat-data) repository is available in a sibling directory. ``` -SKIP_PREFECT=true python3 flows/parsers/parse_ncbi_assemblies.py -i /tmp/assembly-data/ncbi_datasets_canidae.jsonl -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -a +SKIP_PREFECT=true python3 -m flows.parsers.parse_ncbi_assemblies -i /tmp/assembly-data/ncbi_datasets_canidae.jsonl -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -a ``` ### Validate @@ -142,7 +141,7 @@ The `blobtk validate` command is still experimental and has not been tested on t #### Example ``` -SKIP_PREFECT=true python3 flows/lib/validate_file_pair.py -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -w /tmp/assembly-data +SKIP_PREFECT=true python3 -m flows.lib.validate_file_pair -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -w /tmp/assembly-data ``` diff --git a/flows/feature_parsers/__init__.py b/flows/feature_parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flows/feature_parsers/args.py b/flows/feature_parsers/args.py index aff55a5..659b5c0 100644 --- a/flows/feature_parsers/args.py +++ b/flows/feature_parsers/args.py @@ -1,13 +1,4 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import argparse -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" from flows.lib.shared_args import INPUT_PATH, YAML_PATH from flows.lib.shared_args import parse_args as _parse_args diff --git a/flows/feature_parsers/parse_blobtoolkit_assembly.py b/flows/feature_parsers/parse_blobtoolkit_assembly.py index dfd3335..cf87535 100644 --- a/flows/feature_parsers/parse_blobtoolkit_assembly.py +++ b/flows/feature_parsers/parse_blobtoolkit_assembly.py @@ -1,18 +1,6 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import os -import sys from glob import glob -from os.path import abspath, dirname - -# from genomehubs import utils as gh_utils -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - -# from flows.lib import utils # noqa: E402 from flows.lib.conditional_import import flow # noqa: E402 from flows.lib.utils import Parser # noqa: E402 from flows.parsers.args import parse_args # noqa: E402 @@ -76,3 +64,4 @@ def plugin(): """Run the flow.""" args = parse_args("Parse assembly data from a BlobToolKit BlobDir.") parse_blobtoolkit_assembly_data(**vars(args)) + parse_blobtoolkit_assembly_data(**vars(args)) diff --git a/flows/feature_parsers/parse_busco_features.py b/flows/feature_parsers/parse_busco_features.py index 8d5a04c..342862a 100644 --- a/flows/feature_parsers/parse_busco_features.py +++ b/flows/feature_parsers/parse_busco_features.py @@ -1,18 +1,6 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import os -import sys from glob import 
glob -from os.path import abspath, dirname - -# from genomehubs import utils as gh_utils - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" -# from flows.lib import utils # noqa: E402 from flows.lib.conditional_import import flow # noqa: E402 from flows.lib.utils import Parser # noqa: E402 from flows.parsers.args import parse_args # noqa: E402 diff --git a/flows/feature_parsers/register.py b/flows/feature_parsers/register.py index e3c421b..beb9332 100644 --- a/flows/feature_parsers/register.py +++ b/flows/feature_parsers/register.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import importlib.util import os from enum import Enum, auto diff --git a/flows/lib/__init__.py b/flows/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flows/lib/conditional_import.py b/flows/lib/conditional_import.py index 82dc6b1..ada08c4 100644 --- a/flows/lib/conditional_import.py +++ b/flows/lib/conditional_import.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import os diff --git a/flows/lib/fetch_genomehubs_target_list.py b/flows/lib/fetch_genomehubs_target_list.py index 70fb62a..131345e 100644 --- a/flows/lib/fetch_genomehubs_target_list.py +++ b/flows/lib/fetch_genomehubs_target_list.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import os from urllib.parse import urlencode diff --git a/flows/lib/fetch_previous_file_pair.py b/flows/lib/fetch_previous_file_pair.py index 91e7794..db48dbf 100644 --- a/flows/lib/fetch_previous_file_pair.py +++ b/flows/lib/fetch_previous_file_pair.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import gzip import os import shutil diff --git a/flows/lib/for_each_record.py b/flows/lib/for_each_record.py index e25a6f2..8d64134 100644 --- a/flows/lib/for_each_record.py +++ b/flows/lib/for_each_record.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - from typing import Generator from conditional_import import flow diff --git a/flows/lib/index_assembly_features.py b/flows/lib/index_assembly_features.py index 65c59e0..eb1bfb8 100644 --- a/flows/lib/index_assembly_features.py +++ b/flows/lib/index_assembly_features.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import os from urllib.parse import urlencode diff --git a/flows/lib/process_features.py b/flows/lib/process_features.py index 4f2e142..1ed2239 100644 --- a/flows/lib/process_features.py +++ b/flows/lib/process_features.py @@ -1,14 +1,8 @@ -import sys from enum import Enum -from os.path import abspath, dirname from shared_args import WORK_DIR, YAML_PATH, parse_args, required from utils import enum_action -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - from flows.feature_parsers.register import register_plugins # noqa: E402 PARSERS = register_plugins() diff --git a/flows/lib/shared_args.py b/flows/lib/shared_args.py index 426952f..f7d4489 100644 --- a/flows/lib/shared_args.py +++ b/flows/lib/shared_args.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 """ Arguments shared between scripts. 
diff --git a/flows/lib/shared_tasks.py b/flows/lib/shared_tasks.py index af0e4bb..f0f0b60 100644 --- a/flows/lib/shared_tasks.py +++ b/flows/lib/shared_tasks.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import os from conditional_import import NO_CACHE, task diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 8f566c6..946b385 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -606,7 +606,10 @@ def find_s3_file(s3_path: list, filename: str) -> str: def is_safe_path(path: str) -> bool: # Only allow alphanumeric, dash, underscore, dot, slash, and colon (for s3) - return bool(re.match(r"^[\w\-/.:]+$", path)) + # Disallow '..', absolute paths, and directory traversal + if not re.match(r"^[\w\-/.:]+$", path): + return False + return ".." not in path and not path.startswith("/") and not path.startswith("~") def parse_s3_path(s3_path): @@ -680,7 +683,6 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: cmd = [ "s3cmd", "put", - "setacl", "--acl-public", gz_path, s3_path, @@ -692,7 +694,6 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: cmd = [ "s3cmd", "put", - "setacl", "--acl-public", local_path, s3_path, @@ -755,7 +756,7 @@ def parse_tsv(text: str) -> List[Dict[str, str]]: return list(reader) -def last_modified_git_remote(http_path: str) -> Optional[str]: +def last_modified_git_remote(http_path: str) -> Optional[int]: """ Get the last modified date of a file in a git repository. @@ -763,7 +764,7 @@ def last_modified_git_remote(http_path: str) -> Optional[str]: http_path (str): Path to the HTTP file. Returns: - str: Last modified date of the file. + Optional[int]: Last modified date of the file, or None if not found. """ try: if not http_path.startswith("https://gitlab.com/"): @@ -791,13 +792,16 @@ def last_modified_git_remote(http_path: str) -> Optional[str]: else: response = requests.head(http_path, allow_redirects=True, timeout=300) if response.status_code == 200: - return response.headers.get("Last-Modified", None) + if last_modified := response.headers.get("Last-Modified", None): + dt = parser.parse(last_modified) + return int(dt.timestamp()) + return None except Exception as e: print(f"Error parsing GitLab URL or fetching commit info: {e}") return None -def last_modified_http(http_path: str) -> Optional[str]: +def last_modified_http(http_path: str) -> Optional[int]: """ Get the last modified date of a file. @@ -805,7 +809,7 @@ def last_modified_http(http_path: str) -> Optional[str]: http_path (str): Path to the HTTP file. Returns: - str: Last modified date of the file. + Optional[int]: Last modified date of the file, or None if not found. """ if "gitlab.com" in http_path: return last_modified_git_remote(http_path) @@ -818,7 +822,7 @@ def last_modified_http(http_path: str) -> Optional[str]: return None -def last_modified_s3(s3_path: str) -> Optional[str]: +def last_modified_s3(s3_path: str) -> Optional[int]: """ Get the last modified date of a file on S3. @@ -826,7 +830,7 @@ def last_modified_s3(s3_path: str) -> Optional[str]: s3_path (str): Path to the remote file on s3. Returns: - str: Last modified date of the file. + Optional[int]: Last modified date of the file, or None if not found. 
""" s3 = boto3.client("s3") diff --git a/flows/lib/validate_file_pair.py b/flows/lib/validate_file_pair.py index 741c59b..ab55f8a 100644 --- a/flows/lib/validate_file_pair.py +++ b/flows/lib/validate_file_pair.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import json import os import shutil diff --git a/flows/lib/wrapper_fetch_parse_validate.py b/flows/lib/wrapper_fetch_parse_validate.py index 68253d1..522f23a 100644 --- a/flows/lib/wrapper_fetch_parse_validate.py +++ b/flows/lib/wrapper_fetch_parse_validate.py @@ -1,10 +1,5 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import os -import sys from enum import Enum -from os.path import abspath, dirname from typing import Optional from conditional_import import flow @@ -24,10 +19,6 @@ from utils import enum_action from validate_file_pair import validate_file_pair -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - from flows.parsers.register import register_plugins # noqa: E402 PARSERS = register_plugins() diff --git a/flows/parsers/__init__.py b/flows/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flows/parsers/args.py b/flows/parsers/args.py index fec9845..a848e9b 100644 --- a/flows/parsers/args.py +++ b/flows/parsers/args.py @@ -1,13 +1,4 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import argparse -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" from flows.lib.shared_args import APPEND, INPUT_PATH, YAML_PATH from flows.lib.shared_args import parse_args as _parse_args diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index f2e2864..fb489bb 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -1,21 +1,12 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import json import os import subprocess -import sys from collections import defaultdict from glob import glob -from os.path import abspath, dirname from typing import Generator, Optional from genomehubs import utils as gh_utils -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - from flows.lib import utils # noqa: E402 from flows.lib.conditional_import import flow, run_count, task # noqa: E402 from flows.lib.utils import Config, Parser # noqa: E402 diff --git a/flows/parsers/parse_refseq_organelles.py b/flows/parsers/parse_refseq_organelles.py index d496801..09b7d37 100644 --- a/flows/parsers/parse_refseq_organelles.py +++ b/flows/parsers/parse_refseq_organelles.py @@ -1,13 +1,3 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - from flows.lib.utils import Parser # noqa: E402 from flows.parsers.args import parse_args # noqa: E402 diff --git a/flows/parsers/parse_sequencing_status.py b/flows/parsers/parse_sequencing_status.py index 74df853..3be87c8 100644 --- a/flows/parsers/parse_sequencing_status.py +++ b/flows/parsers/parse_sequencing_status.py @@ -1,13 +1,3 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow -import sys -from os.path import abspath, dirname - 
-if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - from flows.lib.utils import Parser # noqa: E402 from flows.parsers.args import parse_args # noqa: E402 diff --git a/flows/parsers/parse_skip_parsing.py b/flows/parsers/parse_skip_parsing.py index 9c92443..11e21c2 100644 --- a/flows/parsers/parse_skip_parsing.py +++ b/flows/parsers/parse_skip_parsing.py @@ -1,13 +1,4 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import os -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" from flows.lib.conditional_import import flow, task # noqa: E402 from flows.lib.shared_tasks import get_filenames # noqa: E402 diff --git a/flows/parsers/register.py b/flows/parsers/register.py index 2a714ab..940b8d4 100644 --- a/flows/parsers/register.py +++ b/flows/parsers/register.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python3 - import importlib.util import os from enum import Enum, auto diff --git a/flows/updaters/__init__.py b/flows/updaters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flows/updaters/api/__init__.py b/flows/updaters/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/flows/updaters/update_boat_config.py b/flows/updaters/update_boat_config.py index a1555fd..e0407d1 100644 --- a/flows/updaters/update_boat_config.py +++ b/flows/updaters/update_boat_config.py @@ -1,21 +1,11 @@ -#!/usr/bin/env python3 - -# sourcery skip: avoid-builtin-shadow import hashlib import os import subprocess -import sys -from os.path import abspath, dirname import boto3 import requests from botocore.exceptions import ClientError -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - __package__ = "flows" - -# from flows.lib.conditional_import import emit_event, flow, task from flows.lib.conditional_import import flow, task from flows.lib.shared_args import ( APPEND, diff --git a/flows/updaters/update_ena_taxonomy_extra.py b/flows/updaters/update_ena_taxonomy_extra.py index 061dde1..90e680c 100644 --- a/flows/updaters/update_ena_taxonomy_extra.py +++ b/flows/updaters/update_ena_taxonomy_extra.py @@ -1,18 +1,9 @@ -#!/usr/bin/env python3 - import json import os -import sys -from os.path import abspath, dirname from urllib.request import urlopen from tqdm import tqdm -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" - from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import ( APPEND, diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index 130a408..e104c6c 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -1,18 +1,9 @@ -#!/usr/bin/env python3 - import os import subprocess -import sys from collections import defaultdict -from os.path import abspath, dirname import yaml -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" - from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import ( INPUT_PATH, diff --git 
a/flows/updaters/update_ncbi_datasets.py b/flows/updaters/update_ncbi_datasets.py index 989af38..056956d 100644 --- a/flows/updaters/update_ncbi_datasets.py +++ b/flows/updaters/update_ncbi_datasets.py @@ -1,19 +1,10 @@ -#!/usr/bin/env python3 - import hashlib import os import subprocess -import sys -from os.path import abspath, dirname import boto3 from botocore.exceptions import ClientError -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" - from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import ( OUTPUT_PATH, diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py index f10b589..6027504 100644 --- a/flows/updaters/update_ncbi_taxonomy.py +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -1,14 +1,5 @@ -#!/usr/bin/env python3 - import os import subprocess -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required diff --git a/flows/updaters/update_nhm.py b/flows/updaters/update_nhm.py index de2275e..6abd80a 100644 --- a/flows/updaters/update_nhm.py +++ b/flows/updaters/update_nhm.py @@ -1,13 +1,3 @@ -#!/usr/bin/env python3 - -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" - from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import MIN_RECORDS, OUTPUT_PATH, parse_args, required from flows.updaters.api import api_config as cfg diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py index f5056e0..b5abd13 100644 --- a/flows/updaters/update_ott_taxonomy.py +++ b/flows/updaters/update_ott_taxonomy.py @@ -1,16 +1,6 @@ -#!/usr/bin/env python3 - +import json import os import subprocess -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" - -import json from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index 51ad9ab..c37d09a 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -1,14 +1,5 @@ -#!/usr/bin/env python3 - import os import subprocess -import sys -from os.path import abspath, dirname - -if __name__ == "__main__" and __package__ is None: - sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) - # sourcery skip: avoid-builtin-shadow - __package__ = "flows" from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required diff --git a/pyproject.toml b/pyproject.toml index 11de11b..3d40f11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,4 +10,16 @@ exclude = ''' | build | dist )/ -''' \ No newline at end of file +''' + +[tool.isort] +profile = 
"black" +line_length = 88 # Match black's default +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +use_parentheses = true +ensure_newline_before_comments = true + +[tool.flake8] +max-line-length = 88 \ No newline at end of file From 2c01fbbea0dd03a2e380cbf26c3a5f0bff42903c Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 09:15:09 +0100 Subject: [PATCH 35/44] add comments to confirm input validation --- flows/lib/utils.py | 2 ++ flows/lib/validate_file_pair.py | 1 + flows/updaters/update_genomehubs_taxonomy.py | 1 + flows/updaters/update_ncbi_taxonomy.py | 3 +++ flows/updaters/update_ott_taxonomy.py | 3 +++ flows/updaters/update_tolid_prefixes.py | 1 + 6 files changed, 11 insertions(+) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 946b385..b0d217e 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -687,6 +687,7 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: gz_path, s3_path, ] + # Inputs have been validated by is_safe_path; safe to use in subprocess result = subprocess.run(cmd, capture_output=True, text=True) os.remove(gz_path) else: @@ -698,6 +699,7 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: local_path, s3_path, ] + # Inputs have been validated by is_safe_path; safe to use in subprocess result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: print( diff --git a/flows/lib/validate_file_pair.py b/flows/lib/validate_file_pair.py index ab55f8a..ad0b1ae 100644 --- a/flows/lib/validate_file_pair.py +++ b/flows/lib/validate_file_pair.py @@ -59,6 +59,7 @@ def validate_yaml_file( ) # Run the command with subprocess run and capture stdout + # Inputs have been validated by is_safe_path; safe to use in subprocess result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) status = result.returncode == 0 output = result.stdout diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index e104c6c..b72d7f4 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -71,6 +71,7 @@ def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> N ] print(f"Running command: {' '.join(cmd)}") try: + # Inputs have been validated by is_safe_path; safe to use in subprocess process = subprocess.Popen( cmd, stdout=subprocess.PIPE, diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py index 6027504..0cc6258 100644 --- a/flows/updaters/update_ncbi_taxonomy.py +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -31,12 +31,14 @@ def fetch_ncbi_taxonomy( # Fetch the remote file cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess subprocess.run(cmd, check=True) remote_md5_path = f"{http_path}.md5" # Fetch the remote MD5 checksum cmd = ["curl", "-sSL", remote_md5_path] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess result = subprocess.run(cmd, check=True, capture_output=True, text=True) remote_md5 = result.stdout.split()[0] @@ -51,6 +53,7 @@ def fetch_ncbi_taxonomy( # extract the tar.gz file cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess subprocess.run(cmd, check=True) # set the timestamp of 
extracted files to match the tar.gz file diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py index b5abd13..b6b8671 100644 --- a/flows/updaters/update_ott_taxonomy.py +++ b/flows/updaters/update_ott_taxonomy.py @@ -32,11 +32,13 @@ def fetch_ott_taxonomy( # Fetch the remote file cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess subprocess.run(cmd, check=True) # extract the tar.gz file cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess subprocess.run(cmd, check=True) # Find the extracted subdirectory (should start with 'ott') @@ -104,6 +106,7 @@ def set_ott_url() -> str: "https://api.opentreeoflife.org/v3/taxonomy/about", ] print(f"Running command: {' '.join(cmd)}") + # Input is generated from static strings; safe to use in subprocess result = subprocess.run(cmd, check=True, capture_output=True, text=True) ott_json = json.loads(result.stdout) diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index c37d09a..dc52f92 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -37,6 +37,7 @@ def fetch_tolid_prefixes( # Fetch the remote file cmd = ["curl", "-sSL", http_path, "-o", local_file] print(f"Running command: {' '.join(cmd)}") + # Inputs have been validated by is_safe_path; safe to use in subprocess subprocess.run(cmd, check=True) # check the number of lines in the file From eb5874198f6ecb3f69b844c3df360265485337f7 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 09:52:33 +0100 Subject: [PATCH 36/44] update safe path check --- flows/lib/utils.py | 11 ++++++----- flows/prefect.yaml | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index b0d217e..b3c2122 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -605,11 +605,12 @@ def find_s3_file(s3_path: list, filename: str) -> str: def is_safe_path(path: str) -> bool: - # Only allow alphanumeric, dash, underscore, dot, slash, and colon (for s3) - # Disallow '..', absolute paths, and directory traversal - if not re.match(r"^[\w\-/.:]+$", path): - return False - return ".." not in path and not path.startswith("/") and not path.startswith("~") + # Only allow alphanumeric, dash, underscore, dot, slash, colon (for s3), tilde, + # and absolute paths. + # Tilde (~) and absolute paths are allowed because this function is only used + # with trusted internal input. + # Directory traversal ('..') is still blocked. + return ".." 
not in path if re.match(r"^[\w\-/.:~]+$", path) else False def parse_s3_path(s3_path): diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 138e74c..8ce5abe 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -17,7 +17,7 @@ pull: # Pull the genomehubs/data repository to get the latest flows id: clone-data repository: https://github.com/genomehubs/data.git - branch: main + branch: feature/pipeline-updates definitions: work_pools: From ee970572346b6beab15e93a93f65b268a81a3699 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 10:00:50 +0100 Subject: [PATCH 37/44] fix parameter mismatch --- flows/prefect.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index 8ce5abe..bfefbbd 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -164,6 +164,6 @@ deployments: parameters: root_taxid: "2759" input_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota-taxonomy-input.yaml" - output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/genomehubs-taxonomy/eukaryota.jsonl" - s3_path: "s3://goat/resources/taxonomy/genomehubs/genomehubs-taxonomy/eukaryota.jsonl.gz" + output_path: "/home/ubuntu/tmp/test/genomehubs-taxonomy/eukaryota/nodes.jsonl" + s3_path: "s3://goat/resources/taxonomy/genomehubs/eukaryota/nodes.jsonl.gz" work_pool: *goat_data_work_pool From 51fc25dcc6392d9fb012c08c5117f7ef771a5c57 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:06:16 +0100 Subject: [PATCH 38/44] set default timeout for all requests --- .github/workflows/flake8.yml | 2 +- flows/lib/fetch_genomehubs_target_list.py | 8 ++++---- flows/lib/index_assembly_features.py | 13 ++++++------- flows/lib/utils.py | 12 ++++++++---- flows/updaters/api/api_config.py | 20 ++++++++++---------- flows/updaters/update_boat_config.py | 7 +++---- 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml index 528987c..8454a22 100644 --- a/.github/workflows/flake8.yml +++ b/.github/workflows/flake8.yml @@ -20,6 +20,6 @@ jobs: with: ignore: E203,E701,W503,W504,BLK100 max_line_length: 88 - path: src + path: flows plugins: flake8-black flake8-isort flake8-quotes error_classes: E,H,I00,Q00 diff --git a/flows/lib/fetch_genomehubs_target_list.py b/flows/lib/fetch_genomehubs_target_list.py index 131345e..c12d978 100644 --- a/flows/lib/fetch_genomehubs_target_list.py +++ b/flows/lib/fetch_genomehubs_target_list.py @@ -1,7 +1,6 @@ import os from urllib.parse import urlencode -import requests from conditional_import import emit_event, flow, task from shared_args import ( API_URL, @@ -13,6 +12,8 @@ parse_args, ) +from flows.lib.utils import safe_get + @task() def fetch_genomehubs_list_file( @@ -50,9 +51,7 @@ def fetch_genomehubs_list_file( print(f"Fetching records from {url}") # Fetch the list of target records - response = requests.get( - url, headers={"Accept": "text/tab-separated-values"}, timeout=300 - ) + response = safe_get(url, headers={"Accept": "text/tab-separated-values"}) response.raise_for_status() records = response.text # write records to file @@ -124,3 +123,4 @@ def fetch_genomehubs_target_list( ) fetch_genomehubs_target_list(**vars(args)) + fetch_genomehubs_target_list(**vars(args)) diff --git a/flows/lib/index_assembly_features.py b/flows/lib/index_assembly_features.py index eb1bfb8..1c0951b 100644 --- a/flows/lib/index_assembly_features.py +++ b/flows/lib/index_assembly_features.py @@ -1,7 +1,6 @@ import os from urllib.parse import urlencode 
-import requests from conditional_import import flow, task from shared_args import ( ASSEMBLY_ID, @@ -13,7 +12,7 @@ parse_args, required, ) -from utils import find_http_file, find_s3_file, get_genomehubs_attribute_value +from utils import find_http_file, find_s3_file, get_genomehubs_attribute_value, safe_get @task() @@ -54,7 +53,7 @@ def list_busco_lineages(assembly_id: str, work_dir: str) -> list: ) url = f"{goat_api}/search?{queryString}" # Fetch the list of BUSCO lineages - response = requests.get(url, timeout=300) + response = safe_get(url) response.raise_for_status() return [ get_genomehubs_attribute_value(result, "odb10_lineage") @@ -73,9 +72,9 @@ def find_busco_files(assembly_id, busco_lineages, work_dir, http_path): for path in busco_http_path: if busco_file := find_http_file(path, f"{lineage}/full_table.tsv"): local_file = f"{busco_work_dir}/{lineage}_full_table.tsv" - requests.get(busco_file, timeout=300).content + safe_get(busco_file).content with open(local_file, "wb") as file: - file.write(requests.get(busco_file, timeout=300).content) + file.write(safe_get(busco_file).content) busco_files.append(local_file) break return busco_files @@ -85,7 +84,7 @@ def find_busco_files(assembly_id, busco_lineages, work_dir, http_path): def find_blobtoolkit_files(assembly_id, work_dir, http_path): blobtoolkit_api_url = "https://blobtoolkit.genomehubs.org/api/v1" blobtoolkit_search_url = f"{blobtoolkit_api_url}/search/{assembly_id}" - response = requests.get(blobtoolkit_search_url, timeout=300) + response = safe_get(blobtoolkit_search_url) response.raise_for_status() results = response.json() if not results: @@ -105,7 +104,7 @@ def find_blobtoolkit_files(assembly_id, work_dir, http_path): return [] # fetch the full dataset metadata blobtoolkit_metadata_url = f"{blobtoolkit_api_url}/dataset/id/{dataset_id}" - response = requests.get(blobtoolkit_metadata_url, timeout=300) + response = safe_get(blobtoolkit_metadata_url) response.raise_for_status() metadata = response.json() print(metadata) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index b3c2122..053447a 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -548,6 +548,10 @@ def __call__(self, parser, namespace, values, option_string=None): return EnumAction +def safe_get(*args, timeout=300, **kwargs): + return requests.get(*args, timeout=timeout, **kwargs) + + def find_http_file(http_path: str, filename: str) -> str: """ Find files for the record ID. @@ -559,7 +563,7 @@ def find_http_file(http_path: str, filename: str) -> str: Returns: str: Path to the file. 
""" - response = requests.get(f"{http_path}/{filename}", timeout=300) + response = safe_get(f"{http_path}/{filename}") return f"{http_path}/{filename}" if response.status_code == 200 else None @@ -786,14 +790,14 @@ def last_modified_git_remote(http_path: str) -> Optional[int]: f"https://gitlab.com/api/v4/projects/{project}/repository/commits" f"?ref_name={ref}&path={file}&per_page=1" ) - response = requests.get(api_url, timeout=300) + response = safe_get(api_url) if response.status_code == 200: commits = response.json() if commits and commits[0].get("committed_date"): dt = parser.isoparse(commits[0]["committed_date"]) return int(dt.timestamp()) else: - response = requests.head(http_path, allow_redirects=True, timeout=300) + response = safe_get(http_path, method="HEAD", allow_redirects=True) if response.status_code == 200: if last_modified := response.headers.get("Last-Modified", None): dt = parser.parse(last_modified) @@ -816,7 +820,7 @@ def last_modified_http(http_path: str) -> Optional[int]: """ if "gitlab.com" in http_path: return last_modified_git_remote(http_path) - response = requests.head(http_path, allow_redirects=True, timeout=300) + response = safe_get(http_path, method="HEAD", allow_redirects=True) if response.status_code == 200: if last_modified := response.headers.get("Last-Modified", None): dt = parser.parse(last_modified) diff --git a/flows/updaters/api/api_config.py b/flows/updaters/api/api_config.py index 84be72e..b8e6f30 100644 --- a/flows/updaters/api/api_config.py +++ b/flows/updaters/api/api_config.py @@ -1,8 +1,9 @@ import json -import requests import yaml +from flows.lib.utils import safe_get + ##################################################################### # VGL ##################################################################### @@ -24,7 +25,7 @@ def vgl_url_opener(**kwargs): "https://raw.githubusercontent.com/vgl-hub/genome-portal/" "master/_data/table_tracker.yml" ) - return requests.get(vgl_url, stream=True, timeout=300) + return safe_get(vgl_url, stream=True) def vgl_hub_count_handler(r_text): @@ -81,7 +82,7 @@ def vgl_row_handler(r_text, fieldnames, **kwargs): def nhm_url_opener(**kwargs): - return requests.post(nhm_url, headers=nhm_headers, json=nhm_post_data, timeout=300) + return safe_get(nhm_url, method="POST", headers=nhm_headers, json=nhm_post_data) def nhm_api_count_handler(r_text): @@ -93,8 +94,8 @@ def nhm_row_handler(fieldnames, **kwargs): nhm_post_data_after = nhm_post_data result = [] while True: - response = requests.post( - nhm_url, headers=nhm_headers, json=nhm_post_data_after, timeout=300 + response = safe_get( + nhm_url, method="POST", headers=nhm_headers, json=nhm_post_data_after ) r = response.json() dl = r["result"]["records"] @@ -168,9 +169,7 @@ def nhm_row_handler(fieldnames, **kwargs): def sts_url_opener(token): - return requests.get( - sts_url, headers={"Token": token, "Project": "ALL"}, verify=False, timeout=300 - ) + return safe_get(sts_url, headers={"Token": token, "Project": "ALL"}, verify=False) def sts_api_count_handler(r_text): @@ -186,8 +185,8 @@ def sts_row_handler(result_count, fieldnames, token, **kwargs): print(page) url = f"{sts_url}?page={page}&page_size={page_size}" - r = requests.get( - url, headers={"Token": token, "Project": "ALL"}, verify=False, timeout=300 + r = safe_get( + url, headers={"Token": token, "Project": "ALL"}, verify=False ).json() dl = r["data"]["list"] @@ -221,3 +220,4 @@ def sts_row_handler(result_count, fieldnames, token, **kwargs): result.extend(d) return result + return result diff --git 
a/flows/updaters/update_boat_config.py b/flows/updaters/update_boat_config.py index e0407d1..fcaacd0 100644 --- a/flows/updaters/update_boat_config.py +++ b/flows/updaters/update_boat_config.py @@ -3,7 +3,6 @@ import subprocess import boto3 -import requests from botocore.exceptions import ClientError from flows.lib.conditional_import import flow, task @@ -16,7 +15,7 @@ parse_args, required, ) -from flows.lib.utils import is_safe_path, parse_tsv +from flows.lib.utils import is_safe_path, parse_tsv, safe_get def taxon_id_to_ssh_path(ssh_host, taxon_id, assembly_name): @@ -121,7 +120,7 @@ def assembly_id_to_busco_sets(alt_host, assembly_id): busco_url = ( f"https://busco.cog.sanger.ac.uk/{assembly_id}/{lineage}/full_table.tsv" ) - response = requests.get(busco_url, timeout=300) + response = safe_get(busco_url) if response.status_code == 200: busco_sets.append(lineage) return f"https://busco.cog.sanger.ac.uk/{assembly_id}", busco_sets @@ -192,7 +191,7 @@ def fetch_goat_results(root_taxid): # fetch query_url with accept header tsv. use python module requests headers = {"Accept": "text/tab-separated-values"} - response = requests.get(query_url, headers=headers, timeout=300) + response = safe_get(query_url, headers=headers) if response.status_code != 200: raise RuntimeError( f"Error fetching BoaT config info: {response.status_code} {response.text}" From 4f18c5d133e956391de28fc643d4cc45c0829fb4 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:24:18 +0100 Subject: [PATCH 39/44] fix sourcery issues --- .../parse_blobtoolkit_assembly.py | 1 - flows/lib/utils.py | 74 ++++++++++++++----- flows/lib/validate_file_pair.py | 3 +- flows/parsers/parse_ncbi_assemblies.py | 2 +- flows/updaters/api/api_config.py | 1 - flows/updaters/update_boat_config.py | 9 +-- flows/updaters/update_genomehubs_taxonomy.py | 4 +- flows/updaters/update_ncbi_datasets.py | 4 +- flows/updaters/update_ncbi_taxonomy.py | 18 ++--- flows/updaters/update_nhm.py | 1 - flows/updaters/update_ott_taxonomy.py | 9 +-- flows/updaters/update_tolid_prefixes.py | 6 +- 12 files changed, 79 insertions(+), 53 deletions(-) diff --git a/flows/feature_parsers/parse_blobtoolkit_assembly.py b/flows/feature_parsers/parse_blobtoolkit_assembly.py index cf87535..eb5dd66 100644 --- a/flows/feature_parsers/parse_blobtoolkit_assembly.py +++ b/flows/feature_parsers/parse_blobtoolkit_assembly.py @@ -64,4 +64,3 @@ def plugin(): """Run the flow.""" args = parse_args("Parse assembly data from a BlobToolKit BlobDir.") parse_blobtoolkit_assembly_data(**vars(args)) - parse_blobtoolkit_assembly_data(**vars(args)) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index 053447a..e6bca9b 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -11,6 +11,7 @@ from csv import DictReader, Sniffer from datetime import datetime from io import StringIO +from shlex import shlex from typing import Dict, List, Optional import boto3 @@ -613,8 +614,25 @@ def is_safe_path(path: str) -> bool: # and absolute paths. # Tilde (~) and absolute paths are allowed because this function is only used # with trusted internal input. - # Directory traversal ('..') is still blocked. - return ".." not in path if re.match(r"^[\w\-/.:~]+$", path) else False + # Directory traversal ('..') is blocked. 
+ # Allow URLs (e.g., http://, https://, s3://) and URL-safe characters + # URL-safe: alphanumeric, dash, underscore, dot, slash, colon, tilde, percent, + # question, ampersand, equals + url_pattern = r"^[\w]+://" + url_safe_pattern = r"^[\w\-/.:~%?&=]+$" + if re.match(url_pattern, path): + return ".." not in path and re.match(url_safe_pattern, path) + return ".." not in path if re.match(url_safe_pattern, path) else False + + +def run_quoted(cmd, **kwargs): + quoted_cmd = [shlex.quote(str(arg)) for arg in cmd] + return subprocess.run(quoted_cmd, **kwargs) + + +def popen_quoted(cmd, **kwargs): + quoted_cmd = [shlex.quote(str(arg)) for arg in cmd] + return subprocess.Popen(quoted_cmd, **kwargs) def parse_s3_path(s3_path): @@ -684,17 +702,31 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: gz_path = f"{local_path}.gz" with open(local_path, "rb") as f_in, gzip.open(gz_path, "wb") as f_out: f_out.write(f_in.read()) - # use s3cmd for uploads due to issues with boto3 and large files - cmd = [ - "s3cmd", - "put", - "--acl-public", - gz_path, - s3_path, - ] - # Inputs have been validated by is_safe_path; safe to use in subprocess - result = subprocess.run(cmd, capture_output=True, text=True) - os.remove(gz_path) + try: + # use s3cmd for uploads due to issues with boto3 and large files + cmd = [ + "s3cmd", + "put", + "--acl-public", + gz_path, + s3_path, + ] + result = run_quoted( + cmd, + capture_output=True, + text=True, + ) + if result.returncode != 0: + print( + ( + f"Error uploading {local_path} to {s3_path} " + f"with s3cmd: {result.stderr}" + ) + ) + raise RuntimeError(f"s3cmd upload failed: {result.stderr}") + finally: + if os.path.exists(gz_path): + os.remove(gz_path) else: # use s3cmd for uploads due to issues with boto3 and large files cmd = [ @@ -704,13 +736,15 @@ def upload_to_s3(local_path: str, s3_path: str, gz: bool = False) -> None: local_path, s3_path, ] - # Inputs have been validated by is_safe_path; safe to use in subprocess - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - print( - f"Error uploading {local_path} to {s3_path} with s3cmd: {result.stderr}" - ) - raise RuntimeError(f"s3cmd upload failed: {result.stderr}") + result = run_quoted(cmd, capture_output=True, text=True) + if result.returncode != 0: + print( + ( + f"Error uploading {local_path} to {s3_path} " + f"with s3cmd: {result.stderr}" + ) + ) + raise RuntimeError(f"s3cmd upload failed: {result.stderr}") except Exception as e: print(f"Error uploading {local_path} to {s3_path}: {e}") raise e diff --git a/flows/lib/validate_file_pair.py b/flows/lib/validate_file_pair.py index ad0b1ae..23c7031 100644 --- a/flows/lib/validate_file_pair.py +++ b/flows/lib/validate_file_pair.py @@ -59,8 +59,7 @@ def validate_yaml_file( ) # Run the command with subprocess run and capture stdout - # Inputs have been validated by is_safe_path; safe to use in subprocess - result = subprocess.run(cmd, stdout=subprocess.PIPE, text=True) + result = utils.run_quoted(cmd, stdout=subprocess.PIPE, text=True) status = result.returncode == 0 output = result.stdout sys.stdout.write(output) diff --git a/flows/parsers/parse_ncbi_assemblies.py b/flows/parsers/parse_ncbi_assemblies.py index fb489bb..3efffab 100644 --- a/flows/parsers/parse_ncbi_assemblies.py +++ b/flows/parsers/parse_ncbi_assemblies.py @@ -46,7 +46,7 @@ def fetch_ncbi_datasets_sequences( """ if not utils.is_safe_path(accession): raise ValueError(f"Unsafe accession: {accession}") - result = subprocess.run( + result = 
utils.run_quoted( [ "datasets", "summary", diff --git a/flows/updaters/api/api_config.py b/flows/updaters/api/api_config.py index b8e6f30..333bcea 100644 --- a/flows/updaters/api/api_config.py +++ b/flows/updaters/api/api_config.py @@ -220,4 +220,3 @@ def sts_row_handler(result_count, fieldnames, token, **kwargs): result.extend(d) return result - return result diff --git a/flows/updaters/update_boat_config.py b/flows/updaters/update_boat_config.py index fcaacd0..71c571c 100644 --- a/flows/updaters/update_boat_config.py +++ b/flows/updaters/update_boat_config.py @@ -1,6 +1,5 @@ import hashlib import os -import subprocess import boto3 from botocore.exceptions import ClientError @@ -15,7 +14,7 @@ parse_args, required, ) -from flows.lib.utils import is_safe_path, parse_tsv, safe_get +from flows.lib.utils import is_safe_path, parse_tsv, run_quoted, safe_get def taxon_id_to_ssh_path(ssh_host, taxon_id, assembly_name): @@ -35,7 +34,7 @@ def taxon_id_to_ssh_path(ssh_host, taxon_id, assembly_name): f"speciesops getdir --taxon_id {taxon_id}'" ), ] - result = subprocess.run(command, capture_output=True, text=True) + result = run_quoted(command, capture_output=True, text=True) if result.returncode != 0: print( ( @@ -73,7 +72,7 @@ def lookup_buscos(ssh_host, file_path): "-c", (f"'ls -d {file_path}/*_odb*/'"), ] - result = subprocess.run(command, capture_output=True, text=True) + result = run_quoted(command, capture_output=True, text=True) if result.returncode != 0: return [] busco_dirs = [ @@ -103,7 +102,7 @@ def assembly_id_to_busco_sets(alt_host, assembly_id): "-c", f"'ls /volumes/data/by_accession/{assembly_id}'", ] - result = subprocess.run(command, capture_output=True, text=True) + result = run_quoted(command, capture_output=True, text=True) if result.returncode == 0: return f"/volumes/data/by_accession/{assembly_id}", result.stdout.splitlines() diff --git a/flows/updaters/update_genomehubs_taxonomy.py b/flows/updaters/update_genomehubs_taxonomy.py index b72d7f4..d4eb7e9 100644 --- a/flows/updaters/update_genomehubs_taxonomy.py +++ b/flows/updaters/update_genomehubs_taxonomy.py @@ -14,7 +14,7 @@ parse_args, required, ) -from flows.lib.utils import fetch_from_s3, is_safe_path, upload_to_s3 +from flows.lib.utils import fetch_from_s3, is_safe_path, popen_quoted, upload_to_s3 def get_file_paths_from_config(config: dict, file_paths: dict) -> dict: @@ -72,7 +72,7 @@ def run_blobtk_taxonomy(root_taxid: str, input_path: str, output_path: str) -> N print(f"Running command: {' '.join(cmd)}") try: # Inputs have been validated by is_safe_path; safe to use in subprocess - process = subprocess.Popen( + process = popen_quoted( cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, diff --git a/flows/updaters/update_ncbi_datasets.py b/flows/updaters/update_ncbi_datasets.py index 056956d..ec7b476 100644 --- a/flows/updaters/update_ncbi_datasets.py +++ b/flows/updaters/update_ncbi_datasets.py @@ -1,6 +1,5 @@ import hashlib import os -import subprocess import boto3 from botocore.exceptions import ClientError @@ -14,6 +13,7 @@ parse_args, required, ) +from flows.lib.utils import run_quoted @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -79,7 +79,7 @@ def fetch_ncbi_datasets_summary( taxid, "--as-json-lines", ] - result = subprocess.run(command, capture_output=True, text=True) + result = run_quoted(command, capture_output=True, text=True) if result.returncode != 0: if ( "V2reportsRankType" in result.stderr diff --git a/flows/updaters/update_ncbi_taxonomy.py b/flows/updaters/update_ncbi_taxonomy.py index 
0cc6258..890d11a 100644 --- a/flows/updaters/update_ncbi_taxonomy.py +++ b/flows/updaters/update_ncbi_taxonomy.py @@ -1,9 +1,13 @@ import os -import subprocess from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import generate_md5, is_local_file_current_http, is_safe_path +from flows.lib.utils import ( + generate_md5, + is_local_file_current_http, + is_safe_path, + run_quoted, +) @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -31,15 +35,13 @@ def fetch_ncbi_taxonomy( # Fetch the remote file cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] print(f"Running command: {' '.join(cmd)}") - # Inputs have been validated by is_safe_path; safe to use in subprocess - subprocess.run(cmd, check=True) + run_quoted(cmd, check=True) remote_md5_path = f"{http_path}.md5" # Fetch the remote MD5 checksum cmd = ["curl", "-sSL", remote_md5_path] print(f"Running command: {' '.join(cmd)}") - # Inputs have been validated by is_safe_path; safe to use in subprocess - result = subprocess.run(cmd, check=True, capture_output=True, text=True) + result = run_quoted(cmd, check=True, capture_output=True, text=True) remote_md5 = result.stdout.split()[0] # Calculate the local MD5 checksum @@ -53,8 +55,7 @@ def fetch_ncbi_taxonomy( # extract the tar.gz file cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] print(f"Running command: {' '.join(cmd)}") - # Inputs have been validated by is_safe_path; safe to use in subprocess - subprocess.run(cmd, check=True) + run_quoted(cmd, check=True) # set the timestamp of extracted files to match the tar.gz file gz_mtime = os.path.getmtime(local_gz_file) @@ -120,4 +121,3 @@ def update_ncbi_taxonomy(output_path: str) -> None: ) update_ncbi_taxonomy(**vars(args)) - update_ncbi_taxonomy(**vars(args)) diff --git a/flows/updaters/update_nhm.py b/flows/updaters/update_nhm.py index 6abd80a..b82d679 100644 --- a/flows/updaters/update_nhm.py +++ b/flows/updaters/update_nhm.py @@ -63,4 +63,3 @@ def update_nhm(output_path: str, min_records: int) -> None: ) update_nhm(**vars(args)) - update_nhm(**vars(args)) diff --git a/flows/updaters/update_ott_taxonomy.py b/flows/updaters/update_ott_taxonomy.py index b6b8671..7280a14 100644 --- a/flows/updaters/update_ott_taxonomy.py +++ b/flows/updaters/update_ott_taxonomy.py @@ -1,10 +1,9 @@ import json import os -import subprocess from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import is_local_file_current_http, is_safe_path +from flows.lib.utils import is_local_file_current_http, is_safe_path, run_quoted @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -33,13 +32,13 @@ def fetch_ott_taxonomy( cmd = ["curl", "-sSL", http_path, "-o", local_gz_file] print(f"Running command: {' '.join(cmd)}") # Inputs have been validated by is_safe_path; safe to use in subprocess - subprocess.run(cmd, check=True) + run_quoted(cmd, check=True) # extract the tar.gz file cmd = ["tar", "-xzf", local_gz_file, "-C", local_path] print(f"Running command: {' '.join(cmd)}") # Inputs have been validated by is_safe_path; safe to use in subprocess - subprocess.run(cmd, check=True) + run_quoted(cmd, check=True) # Find the extracted subdirectory (should start with 'ott') extracted_dirs = [ @@ -107,7 +106,7 @@ def set_ott_url() -> str: ] print(f"Running command: {' '.join(cmd)}") # Input is generated from static strings; safe to use in subprocess - result = 
subprocess.run(cmd, check=True, capture_output=True, text=True) + result = run_quoted(cmd, check=True, capture_output=True, text=True) ott_json = json.loads(result.stdout) # Extract required fields diff --git a/flows/updaters/update_tolid_prefixes.py b/flows/updaters/update_tolid_prefixes.py index dc52f92..0f46b44 100644 --- a/flows/updaters/update_tolid_prefixes.py +++ b/flows/updaters/update_tolid_prefixes.py @@ -1,9 +1,8 @@ import os -import subprocess from flows.lib.conditional_import import emit_event, flow, task from flows.lib.shared_args import OUTPUT_PATH, parse_args, required -from flows.lib.utils import is_local_file_current_http, is_safe_path +from flows.lib.utils import is_local_file_current_http, is_safe_path, run_quoted @task(retries=2, retry_delay_seconds=2, log_prints=True) @@ -37,8 +36,7 @@ def fetch_tolid_prefixes( # Fetch the remote file cmd = ["curl", "-sSL", http_path, "-o", local_file] print(f"Running command: {' '.join(cmd)}") - # Inputs have been validated by is_safe_path; safe to use in subprocess - subprocess.run(cmd, check=True) + run_quoted(cmd, check=True) # check the number of lines in the file with open(local_file, "r") as f: From 1e6daf7e4f4e4deb122eb4c6bd31e6b6a6826194 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:34:53 +0100 Subject: [PATCH 40/44] handle different request methods --- flows/lib/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index e6bca9b..d8ea814 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -549,8 +549,13 @@ def __call__(self, parser, namespace, values, option_string=None): return EnumAction -def safe_get(*args, timeout=300, **kwargs): - return requests.get(*args, timeout=timeout, **kwargs) +def safe_get(*args, method="GET", timeout=300, **kwargs): + if method == "GET": + return requests.get(*args, timeout=timeout, **kwargs) + elif method == "POST": + return requests.post(*args, timeout=timeout, **kwargs) + elif method == "HEAD": + return requests.head(*args, timeout=timeout, **kwargs) def find_http_file(http_path: str, filename: str) -> str: From abf05422db6bb87d7eb81445801d6f3ddaf36f97 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:40:29 +0100 Subject: [PATCH 41/44] fix shlex import --- flows/lib/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/lib/utils.py b/flows/lib/utils.py index d8ea814..1f40f8f 100644 --- a/flows/lib/utils.py +++ b/flows/lib/utils.py @@ -5,13 +5,13 @@ import hashlib import os import re +import shlex import shutil import subprocess from argparse import Action from csv import DictReader, Sniffer from datetime import datetime from io import StringIO -from shlex import shlex from typing import Dict, List, Optional import boto3 From 7d37ddde6c76dddaddb155b43c2c514b919a5371 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:45:15 +0100 Subject: [PATCH 42/44] update flake8 config --- .flake8 | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..9747547 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +inline-quotes = " +multiline-quotes = """ +docstring-quotes = """ \ No newline at end of file From fa27aacf93df5be0ef3fc2f7b8116a168cf9c1b2 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 11:47:53 +0100 Subject: [PATCH 43/44] remove unused import --- flows/lib/for_each_record.py | 2 -- 1 file changed, 2 deletions(-) diff 
--git a/flows/lib/for_each_record.py b/flows/lib/for_each_record.py index 8d64134..fc03aa7 100644 --- a/flows/lib/for_each_record.py +++ b/flows/lib/for_each_record.py @@ -5,7 +5,6 @@ ID_COLUMN, INPUT_PATH, S3_PATH, - SSH_PATH, WORK_DIR, multi, parse_args, @@ -46,7 +45,6 @@ def for_each_record( required(ID_COLUMN), WORK_DIR, multi(S3_PATH), - # multi(SSH_PATH), ], "Run a flow for each record in an input file.", ) From 76fe33d2c2987fe5732a13c0cb345885e32c1413 Mon Sep 17 00:00:00 2001 From: Richard Challis Date: Wed, 17 Sep 2025 13:15:39 +0100 Subject: [PATCH 44/44] use main branch in config --- flows/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flows/prefect.yaml b/flows/prefect.yaml index bfefbbd..9a45e6d 100644 --- a/flows/prefect.yaml +++ b/flows/prefect.yaml @@ -17,7 +17,7 @@ pull: # Pull the genomehubs/data repository to get the latest flows id: clone-data repository: https://github.com/genomehubs/data.git - branch: feature/pipeline-updates + branch: main definitions: work_pools: