Merged
44 commits
- 874567b skip atypical assemblies (rjchallis, Sep 10, 2025)
- 2e7c027 include line number when reporting exceptions (rjchallis, Sep 10, 2025)
- f419bd0 include full exception (rjchallis, Sep 10, 2025)
- 3cedc58 add more debugging (rjchallis, Sep 10, 2025)
- 8ac1f84 ensure linkedAssembly is not an empty string (rjchallis, Sep 10, 2025)
- 1d22f9c ensure linked assembly is a list (rjchallis, Sep 10, 2025)
- fda4b33 tidy debugging code (rjchallis, Sep 10, 2025)
- eebe07f add update flows for ncbi taxonomy and tolid prefixes (rjchallis, Sep 10, 2025)
- ac21691 add ott and ena taxonomy updaters (rjchallis, Sep 11, 2025)
- 307fbf7 use s3 file for ena taxonomy extra update (rjchallis, Sep 12, 2025)
- fe622c3 make 3s fetch conditional on append (rjchallis, Sep 12, 2025)
- 56ddc62 add genomehubs taxonomy updater (rjchallis, Sep 12, 2025)
- ba03baf comment out atypical assembly filter (rjchallis, Sep 12, 2025)
- 68cf6be run flow on feature branch for testing (rjchallis, Sep 12, 2025)
- b77d9ff remove unused variables (rjchallis, Sep 12, 2025)
- 89ac124 update ena taxonomy (rjchallis, Sep 12, 2025)
- 69cc057 add root taxon id parameter (rjchallis, Sep 12, 2025)
- fc73970 gzip ena jsonl on s3 (rjchallis, Sep 12, 2025)
- ce9a733 use gzipped s3 jsonl file (rjchallis, Sep 12, 2025)
- a9067c4 use s3cmd for s3 upload (rjchallis, Sep 15, 2025)
- b83429d fix unzip behaviour (rjchallis, Sep 15, 2025)
- 0e05550 include directories in file exists check (rjchallis, Sep 15, 2025)
- c8d031e update filenames in prefect.yaml (rjchallis, Sep 15, 2025)
- 442a8c8 fix variable name conflict (rjchallis, Sep 15, 2025)
- d0e7b02 fix case for output flag (rjchallis, Sep 15, 2025)
- c46de06 write blobtk output to prefect logs (rjchallis, Sep 15, 2025)
- e8fa6ca don't append nodes.jsonl to output path (rjchallis, Sep 15, 2025)
- 73a8361 gzip remote jsonl (rjchallis, Sep 15, 2025)
- 3b9ee4d add step to sort ena jsonl by lineage (rjchallis, Sep 15, 2025)
- e21380f gzip jsonl files (rjchallis, Sep 15, 2025)
- d407691 update genomehubs taxonomy paths (rjchallis, Sep 16, 2025)
- eed01ca set config to use main branch (rjchallis, Sep 16, 2025)
- fec0361 address sourcery issues (rjchallis, Sep 16, 2025)
- e68d6d7 use module syntax to allow relative imports (rjchallis, Sep 17, 2025)
- 2c01fbb add comments to confirm input validation (rjchallis, Sep 17, 2025)
- eb58741 update safe path check (rjchallis, Sep 17, 2025)
- ee97057 fix parameter mismatch (rjchallis, Sep 17, 2025)
- 51fc25d set default timeout for all requests (rjchallis, Sep 17, 2025)
- 4f18c5d fix sourcery issues (rjchallis, Sep 17, 2025)
- 1e6daf7 handle different request methods (rjchallis, Sep 17, 2025)
- abf0542 fix shlex import (rjchallis, Sep 17, 2025)
- 7d37ddd update flake8 config (rjchallis, Sep 17, 2025)
- fa27aac remove unused import (rjchallis, Sep 17, 2025)
- 76fe33d use main branch in config (rjchallis, Sep 17, 2025)
4 changes: 4 additions & 0 deletions .flake8
@@ -0,0 +1,4 @@
[flake8]
inline-quotes = "
multiline-quotes = """
docstring-quotes = """
2 changes: 1 addition & 1 deletion .github/workflows/flake8.yml
@@ -20,6 +20,6 @@ jobs:
with:
ignore: E203,E701,W503,W504,BLK100
max_line_length: 88
path: src
path: flows
plugins: flake8-black flake8-isort flake8-quotes
error_classes: E,H,I00,Q00
5 changes: 4 additions & 1 deletion .vscode/settings.json
@@ -12,5 +12,8 @@
"yaml.schemas": {
"swaggerviewer:openapi": "file:///Users/rchallis/projects/genomehubs/genomehubs/src/genomehubs-api/src/api-v2.yaml"
},
"restructuredtext.pythonRecommendation.disabled": true
"restructuredtext.pythonRecommendation.disabled": true,
"python-envs.defaultEnvManager": "ms-python.python:conda",
"python-envs.defaultPackageManager": "ms-python.python:conda",
"python-envs.pythonProjects": []
}
11 changes: 5 additions & 6 deletions flows/README.md
@@ -2,7 +2,6 @@

A collection of prefect flows for processing and importing data into a GenomeHubs index.


# Initial setup

## Install prefect
@@ -85,7 +84,7 @@ prefect --no-prompt deploy --prefect-file flows/prefect.yaml --all

# Local development

All example commands below assume dependencies have been installed as described in [install dependencies](#install-dependencies) above. For local development, the [install prefect](#install-prefect) and [deploy flows](#deploy-flows) steps are not needed. All flows can be run with a `SKIP_PREFECT` environment variable to run without a Prefect API connection.

When writing new flows and tasks, please follow the established conventions, referring to existing files as examples. The import sections need to handle running with and without prefect so will typically make use of `flows/lib/conditional_import.py` and command line arguments should be standardised across all flows by importing argument definitions from `flows/lib/shared_args.py`.
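The contents of `flows/lib/conditional_import.py` are not shown in full in this diff, but the pattern it supports might be sketched along the following lines. The function names `get_decorators` and `_noop_decorator` are illustrative assumptions, not identifiers from the repository:

```python
import os


def _noop_decorator(*dargs, **dkwargs):
    """Stand-in for Prefect's @flow/@task when running without a Prefect API.

    Supports both the bare (@flow) and the called (@flow(name=...)) forms.
    """
    if len(dargs) == 1 and callable(dargs[0]) and not dkwargs:
        # used as a bare decorator: @flow
        return dargs[0]

    def decorator(func):
        # used with arguments: @flow(name=...)
        return func

    return decorator


def get_decorators(skip_prefect=None):
    """Return (flow, task) decorators, real or no-op depending on SKIP_PREFECT."""
    if skip_prefect is None:
        skip_prefect = os.environ.get("SKIP_PREFECT", "").lower() in ("1", "true", "yes")
    if skip_prefect:
        return _noop_decorator, _noop_decorator
    # only import prefect when a Prefect API connection is expected
    from prefect import flow, task

    return flow, task
```

With `SKIP_PREFECT` set, the decorated functions run as plain Python callables, which is what allows the example commands below to run without a Prefect server.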

@@ -100,7 +99,7 @@ Updaters are flows used to update the local copy of data from a remote resource,
The `update_ncbi_datasets.py` updater runs the [ncbi datasets] tool for a given root taxon ID to return a JSONL file with one line per assembly. It will optionally compare the number of lines in the fetched file to a previous version (stored in an s3 bucket) to determine whether there are additional records available. The flow emits an event on completion that can be used to trigger related flows.

```
SKIP_PREFECT=true python3 flows/updaters/update_ncbi_datasets.py -r 9608 -o /tmp/assembly-data/ncbi_datasets_canidae.jsonl -s s3://goat/resources/assembly-data/ncbi_datasets_canidae.jsonl
SKIP_PREFECT=true python3 -m flows.updaters.update_ncbi_datasets -r 9608 -o /tmp/assembly-data/ncbi_datasets_canidae.jsonl -s s3://goat/resources/assembly-data/ncbi_datasets_canidae.jsonl
```
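The line-count comparison mentioned above can be sketched as follows. This is an illustration of the idea, not the flow's actual code, and the function names are assumptions; in the real flow the previous copy is first fetched from the s3 bucket:

```python
import gzip
import json


def count_jsonl_records(path):
    """Count newline-delimited JSON records, handling optional gzip compression."""
    opener = gzip.open if path.endswith(".gz") else open
    count = 0
    with opener(path, "rt") as handle:
        for line in handle:
            if line.strip():
                json.loads(line)  # fail early on malformed lines
                count += 1
    return count


def has_new_records(fetched_path, previous_path):
    """True if the freshly fetched file has more records than the previous copy."""
    return count_jsonl_records(fetched_path) > count_jsonl_records(previous_path)
```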

## Fetch parse validate
@@ -116,7 +115,7 @@ The flow at `flows/lib/fetch_previous_file_pair.py` is used to fetch a YAML/TSV
This example command assumes the [genomehubs/goat-data](https://github.com/genomehubs/goat-data) repository is available in a sibling directory.

```
SKIP_PREFECT=true python3 flows/lib/fetch_previous_file_pair.py -y ../goat-data/sources/assembly-data/ncbi_datasets_eukaryota.types.yaml -s s3://goat/sources/assembly-data -w /tmp/assembly-data
SKIP_PREFECT=true python3 -m flows.lib.fetch_previous_file_pair -y ../goat-data/sources/assembly-data/ncbi_datasets_eukaryota.types.yaml -s s3://goat/sources/assembly-data -w /tmp/assembly-data
```

### `flows/parsers`
@@ -130,7 +129,7 @@ The `parse_ncbi_assemblies.py` parser takes an NCBI datasets JSONL file as input
This example command assumes the [genomehubs/goat-data](https://github.com/genomehubs/goat-data) repository is available in a sibling directory.

```
SKIP_PREFECT=true python3 flows/parsers/parse_ncbi_assemblies.py -i /tmp/assembly-data/ncbi_datasets_canidae.jsonl -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -a
SKIP_PREFECT=true python3 -m flows.parsers.parse_ncbi_assemblies -i /tmp/assembly-data/ncbi_datasets_canidae.jsonl -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -a
```

### Validate
@@ -142,7 +141,7 @@ The `blobtk validate` command is still experimental and has not been tested on t
#### Example

```
SKIP_PREFECT=true python3 flows/lib/validate_file_pair.py -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -w /tmp/assembly-data
SKIP_PREFECT=true python3 -m flows.lib.validate_file_pair -y /tmp/assembly-data/ncbi_datasets_eukaryota.types.yaml -w /tmp/assembly-data

```
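`blobtk validate` is an external tool, so its logic is not part of this PR, but a toy illustration of the kind of check a TSV/YAML validation step performs might look like this (this is not blobtk's actual implementation, only a sketch of validating TSV headers against declared columns):

```python
import csv


def missing_columns(tsv_path, declared_columns):
    """Return declared column names that are absent from the TSV header row."""
    with open(tsv_path, newline="") as fh:
        header = next(csv.reader(fh, delimiter="\t"))
    return [column for column in declared_columns if column not in header]
```

A real validator would go further, checking value types and constraints declared in the types YAML file, not just header presence.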

Empty file.
9 changes: 0 additions & 9 deletions flows/feature_parsers/args.py
@@ -1,13 +1,4 @@
#!/usr/bin/env python3

# sourcery skip: avoid-builtin-shadow
import argparse
import sys
from os.path import abspath, dirname

if __name__ == "__main__" and __package__ is None:
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
__package__ = "flows"

from flows.lib.shared_args import INPUT_PATH, YAML_PATH
from flows.lib.shared_args import parse_args as _parse_args
12 changes: 0 additions & 12 deletions flows/feature_parsers/parse_blobtoolkit_assembly.py
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# sourcery skip: avoid-builtin-shadow
import os
import sys
from glob import glob
from os.path import abspath, dirname

# from genomehubs import utils as gh_utils

if __name__ == "__main__" and __package__ is None:
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
__package__ = "flows"

# from flows.lib import utils # noqa: E402
from flows.lib.conditional_import import flow # noqa: E402
from flows.lib.utils import Parser # noqa: E402
from flows.parsers.args import parse_args # noqa: E402
12 changes: 0 additions & 12 deletions flows/feature_parsers/parse_busco_features.py
@@ -1,18 +1,6 @@
#!/usr/bin/env python3

# sourcery skip: avoid-builtin-shadow
import os
import sys
from glob import glob
from os.path import abspath, dirname

# from genomehubs import utils as gh_utils

if __name__ == "__main__" and __package__ is None:
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
__package__ = "flows"

# from flows.lib import utils # noqa: E402
from flows.lib.conditional_import import flow # noqa: E402
from flows.lib.utils import Parser # noqa: E402
from flows.parsers.args import parse_args # noqa: E402
2 changes: 0 additions & 2 deletions flows/feature_parsers/register.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python3

import importlib.util
import os
from enum import Enum, auto
Empty file added flows/lib/__init__.py
Empty file.
2 changes: 0 additions & 2 deletions flows/lib/conditional_import.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python3

import os


8 changes: 4 additions & 4 deletions flows/lib/fetch_genomehubs_target_list.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python3

import os
from urllib.parse import urlencode

import requests
from conditional_import import emit_event, flow, task
from shared_args import (
API_URL,
@@ -15,6 +12,8 @@
parse_args,
)

from flows.lib.utils import safe_get


@task()
def fetch_genomehubs_list_file(
@@ -52,7 +51,7 @@ def fetch_genomehubs_list_file(

print(f"Fetching records from {url}")
# Fetch the list of target records
response = requests.get(url, headers={"Accept": "text/tab-separated-values"})
response = safe_get(url, headers={"Accept": "text/tab-separated-values"})
response.raise_for_status()
records = response.text
# write records to file
@@ -124,3 +123,4 @@ def fetch_genomehubs_target_list(
)

fetch_genomehubs_target_list(**vars(args))
fetch_genomehubs_target_list(**vars(args))
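The `safe_get` helper imported above is not itself shown in the diff, but the related commit message ("set default timeout for all requests") suggests it wraps `requests.get` so that a timeout is always supplied. A generic sketch of that idea follows; the wrapper, its name, and the 30-second default are assumptions, not the repository's actual implementation:

```python
from functools import wraps

DEFAULT_TIMEOUT = 30  # seconds; an assumed default, not taken from the repo


def with_default_timeout(request_func, default_timeout=DEFAULT_TIMEOUT):
    """Wrap an HTTP call (e.g. requests.get) so it always receives a timeout."""

    @wraps(request_func)
    def wrapper(url, **kwargs):
        # inject the default only when the caller has not set one explicitly
        kwargs.setdefault("timeout", default_timeout)
        return request_func(url, **kwargs)

    return wrapper


# hypothetical application: safe_get = with_default_timeout(requests.get)
```

Centralising the timeout like this avoids the failure mode where a single forgotten `timeout=` argument leaves a flow hanging indefinitely on an unresponsive server.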
2 changes: 0 additions & 2 deletions flows/lib/fetch_previous_file_pair.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python3

import gzip
import os
import shutil
4 changes: 0 additions & 4 deletions flows/lib/for_each_record.py
@@ -1,13 +1,10 @@
#!/usr/bin/env python3

from typing import Generator

from conditional_import import flow
from shared_args import (
ID_COLUMN,
INPUT_PATH,
S3_PATH,
SSH_PATH,
WORK_DIR,
multi,
parse_args,
@@ -48,7 +45,6 @@ def for_each_record(
required(ID_COLUMN),
WORK_DIR,
multi(S3_PATH),
# multi(SSH_PATH),
],
"Run a flow for each record in an input file.",
)
19 changes: 8 additions & 11 deletions flows/lib/index_assembly_features.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python3

import os
from urllib.parse import urlencode

import requests
from conditional_import import flow, task
from shared_args import (
ASSEMBLY_ID,
@@ -15,7 +12,7 @@
parse_args,
required,
)
from utils import find_http_file, find_s3_file, get_genomehubs_attribute_value
from utils import find_http_file, find_s3_file, get_genomehubs_attribute_value, safe_get


@task()
@@ -56,7 +53,7 @@ def list_busco_lineages(assembly_id: str, work_dir: str) -> list:
)
url = f"{goat_api}/search?{queryString}"
# Fetch the list of BUSCO lineages
response = requests.get(url)
response = safe_get(url)
response.raise_for_status()
return [
get_genomehubs_attribute_value(result, "odb10_lineage")
@@ -75,9 +72,9 @@ def find_busco_files(assembly_id, busco_lineages, work_dir, http_path):
for path in busco_http_path:
if busco_file := find_http_file(path, f"{lineage}/full_table.tsv"):
local_file = f"{busco_work_dir}/{lineage}_full_table.tsv"
requests.get(busco_file).content
safe_get(busco_file).content
with open(local_file, "wb") as file:
file.write(requests.get(busco_file).content)
file.write(safe_get(busco_file).content)
busco_files.append(local_file)
break
return busco_files
@@ -87,7 +84,7 @@ def find_blobtoolkit_files(assembly_id, work_dir, http_path):
def find_blobtoolkit_files(assembly_id, work_dir, http_path):
blobtoolkit_api_url = "https://blobtoolkit.genomehubs.org/api/v1"
blobtoolkit_search_url = f"{blobtoolkit_api_url}/search/{assembly_id}"
response = requests.get(blobtoolkit_search_url)
response = safe_get(blobtoolkit_search_url)
response.raise_for_status()
results = response.json()
if not results:
@@ -107,7 +104,7 @@ def find_blobtoolkit_files(assembly_id, work_dir, http_path):
return []
# fetch the full dataset metadata
blobtoolkit_metadata_url = f"{blobtoolkit_api_url}/dataset/id/{dataset_id}"
response = requests.get(blobtoolkit_metadata_url)
response = safe_get(blobtoolkit_metadata_url)
response.raise_for_status()
metadata = response.json()
print(metadata)
@@ -132,8 +129,8 @@ def index_assembly_features(
# if snapshot_exists(s3_path, assembly_id, "feature"):
# return taxon_id
# find_files(assembly_id, work_dir, s3_path)
busco_lineages = list_busco_lineages(assembly_id, work_dir)
busco_files = find_busco_files(assembly_id, busco_lineages, work_dir, http_path)
# busco_lineages = list_busco_lineages(assembly_id, work_dir)
# busco_files = find_busco_files(assembly_id, busco_lineages, work_dir, http_path)
blobtoolkit_files = find_blobtoolkit_files(assembly_id, work_dir, http_path)
print(blobtoolkit_files)

6 changes: 0 additions & 6 deletions flows/lib/process_features.py
@@ -1,14 +1,8 @@
import sys
from enum import Enum
from os.path import abspath, dirname

from shared_args import WORK_DIR, YAML_PATH, parse_args, required
from utils import enum_action

if __name__ == "__main__" and __package__ is None:
sys.path.insert(0, dirname(dirname(dirname(abspath(__file__)))))
__package__ = "flows"

from flows.feature_parsers.register import register_plugins # noqa: E402

PARSERS = register_plugins()
1 change: 0 additions & 1 deletion flows/lib/shared_args.py
@@ -1,4 +1,3 @@
#!/usr/bin/env python3
"""
Arguments shared between scripts.

2 changes: 0 additions & 2 deletions flows/lib/shared_tasks.py
@@ -1,5 +1,3 @@
#!/usr/bin/env python3

import os

from conditional_import import NO_CACHE, task