From aca05379fa588c5046c83533442368940261e61d Mon Sep 17 00:00:00 2001 From: SamT123 Date: Tue, 15 Jul 2025 09:52:25 +0100 Subject: [PATCH 01/16] Ignore .vscode folder --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index efcd6d3a..12bb8e5f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build .cache .ccls-cache __pycache__ +/.vscode From c666a9f64749142bc0251798e91a048e82fd35af Mon Sep 17 00:00:00 2001 From: SamT123 Date: Tue, 15 Jul 2025 14:07:46 +0100 Subject: [PATCH 02/16] Sarah's changes These are changes from /home/slj38/ae which had not been committed to skepner/ae. I'm committing them, then making the changes necessary to use cmaple on top --- README.md | 1 - bin/seqdb-update | 2 - cc/whocc/xlsx/sheet-extractor.cc | 7 +- proj/weekly-tree/export | 291 +++++++++++++++++++- proj/weekly-tree/export~ | 437 ++++++++++++++++++++++++++++++ proj/weekly-tree/garli-h3 | 452 +++++++++++++++++++++++++++++++ proj/weekly-tree/garli-memvar | 452 +++++++++++++++++++++++++++++++ proj/weekly-tree/init-h3 | 93 +++++++ proj/weekly-tree/submit | 2 + proj/weekly-tree/submit-memvar | 18 ++ proj/weekly-tree/submit-new | 18 ++ 11 files changed, 1764 insertions(+), 9 deletions(-) create mode 100755 proj/weekly-tree/export~ create mode 100755 proj/weekly-tree/garli-h3 create mode 100755 proj/weekly-tree/garli-memvar create mode 100755 proj/weekly-tree/init-h3 create mode 100755 proj/weekly-tree/submit-memvar create mode 100755 proj/weekly-tree/submit-new diff --git a/README.md b/README.md index 5afdaa17..a8f15e63 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@ - install homebrew https://brew.sh - brew install llvm ninja meson libomp cmake brotli zlib xz gnu-time catch2 -- pip3 install mypy ## Installing dependencies on Ubuntu diff --git a/bin/seqdb-update b/bin/seqdb-update index db670c9a..ab0fdde2 100755 --- a/bin/seqdb-update +++ b/bin/seqdb-update @@ -39,8 +39,6 @@ def main(args: argparse.Namespace): # only aligned are stored ae_backend.raw_sequence.calculate_hash(sequence) sequences_by_subtype.setdefault(sequence.type_subtype, []).append(sequence) - else: - print(f"{sequence.name:40s} {sequence.aa}") except Exception as err: print(f"> {err}: {sequence.name}\n {sequence.aa}") reader.context.messages_from_backend(sequence_messages) diff --git a/cc/whocc/xlsx/sheet-extractor.cc b/cc/whocc/xlsx/sheet-extractor.cc index 836de0fb..9b9bfeaa 100644 --- a/cc/whocc/xlsx/sheet-extractor.cc +++ b/cc/whocc/xlsx/sheet-extractor.cc @@ -96,7 +96,7 @@ static const std::regex re_VIDRL_serum_name{"^(?:[AB]/)?([A-Z][A-Z ]+)/?([0-9]+) static const std::regex re_VIDRL_serum_id{"^(" pattern_VIDRL_serum_id "|" pattern_CRICK_serum_id ")$", regex_icase}; static const std::regex re_VIDRL_serum_id_with_days{"^[AF][0-9][0-9][0-9][0-9]-[0-9]+D$", regex_icase}; -static const std::regex re_human_who_serum{R"(^\s*(.*(HUMAN|WHO|NORMAL)|GOAT|POST? VAX|VAX POOL|SERA POOL)\b)", regex_icase}; // "POST VAX": VIDRL H3 HI 2021, "HUMAN VAX": VIDRL H3 HI 2022 +static const std::regex re_human_who_serum{R"(^\s*(.*(HUMAN|WHO|NORMAL)|GOAT|POST? 
VAX|VAX POOL)\b)", regex_icase}; // "POST VAX": VIDRL H3 HI 2021, "HUMAN VAX": VIDRL H3 HI 2022 #pragma GCC diagnostic pop @@ -339,16 +339,13 @@ std::string ae::xlsx::v1::Extractor::titer(size_t ag_no, size_t sr_no) const void ae::xlsx::v1::Extractor::find_titers(warn_if_not_found winf) { std::vector>> rows; - AD_DEBUG("Sheet {} rows:{}", sheet().name(), sheet().number_of_rows()); + // AD_DEBUG("Sheet {}", sheet().name()); for (nrow_t row{0}; row < sheet().number_of_rows(); ++row) { auto titers = sheet().titer_range(row); - AD_DEBUG("Row:{} titers: ({} valid:{}) {}:{}", row, titers.length(), titers.valid(), titers.first, titers.second); adjust_titer_range(row, titers); - AD_DEBUG("Row:{} titers: ({} valid:{}) {}:{}", row, titers.length(), titers.valid(), titers.first, titers.second); if (titers.valid() && titers.length() > 2 && titers.first > ncol_t{0} && valid_titer_row(row, titers)) rows.emplace_back(row, std::move(titers)); } - AD_DEBUG("Rows with titers: {}", rows.size()); if (!ranges::all_of(rows, [&rows](const auto& en) { return en.second == rows[0].second; })) { fmt::memory_buffer report; // fmt::format(rows, "{}", "\n ")); diff --git a/proj/weekly-tree/export b/proj/weekly-tree/export index 580e9042..e4261c5f 100755 --- a/proj/weekly-tree/export +++ b/proj/weekly-tree/export @@ -36,7 +36,72 @@ def bvic(subtype: str, args: argparse.Namespace): # "ZAMBIA/503/2021", # potential long branch 2022-0401 # 2023-01-28 potential long branch "KRABI/FS002/2022", - ]) + "CATALONIA/NSVH151199301/2023_OR_0ACCE47F", # XXXX 2023-0421 + "SOUTH_AFRICA/R10785/2022_OR_0EB8BBEA", # XXX 2023-0515 + "SOUTH_AFRICA/R10907/2022_OR_BA5B358C", + "SOUTH_AFRICA/R09298/2022_OR_1A5C630A", + "SOUTH_AFRICA/R10914/2022_OR_ADA6D682", + "SOUTH_AFRICA/R10638/2022_OR_5EAB4CBB", + "SOUTH_AFRICA/R11310/2022_OR_12125C84", + "SOUTH_AFRICA/R11575/2022_OR_FED84F4F", + "SOUTH_AFRICA/R08092/2022_OR_93DDF2B2", + # XXX 2023-0602 + "LA_REUNION/HCL023041635201/2022_103E7754", + "ROMANIA/550950/2023_MDCK-SIAT_E0DECCD2", + "SAINT_PETERSBURG/RII-10866S/2023_OR_477275F7", + # XXX2023-0629 + "ABU_DHABI/4030/2022_AEC2DE48", + "RIO_GRANDE_DO_SUL/28773/2022_455F4FFF", + # 2023-0731 + "MALAYSIA/IMR-SARI0563/2023_25FE3E0E", + #2023-0811 Yam + "KENYA/KCH157321/2019_OR_308AC34A", + # 2023-0822 XXX + "ALAGOAS/5742/2023_OR_441065AD", + "ALAGOAS/4659/2023_OR_558AC3D6", + "PARANA/5012/2023_OR_D54659DC", + # 2023-1005 XXXX + "UNITED_KINGDOM/UO-476/2023", + # 2024-0130 + "SANTA_CATARINA/10748/2023_OR_7B192215", + # 2024-0703 + "MADRID/HULP-200063972/2024_C98ADFD1", # unknown at deletion positions 162-164 + "MADRID/HULP-200041842/2024_5D4B7141", # unknown at deletion positions 162-164 + # 2024-0718 + "AUSTRIA/MUW1681066/2024_OR_3D9C21FB", # XXXX + "ATHENS/ILI411/2024_OR_556F1400", # XXXX + # 2024-0411 + "PAIS_VASCO/1919/2024_OR_1D682639", + "GALICIA/52314290/2024_AF2005E4", + # 2025-2411 + "MADAGASCAR/3422/2024_OR_DF3D589F", + "MEXICO/2429277701/2024_OR_3BF2F243", + "GALICIA/GA-CHUAC-250/2025_OR_AC92C5F3", + # 2025-1202 + "PAKISTAN/NIC-JHL/2010_OR_EC56A7FD", + "PAKISTAN/NIC-ICT-284/2025_OR_8DAE4B6A", + "PAKISTAN/NIC-ICT-218/2025_OR_362FAB6C", + "PAKISTAN/NMC-12/2025_OR_7B217EEC", + "PAKISTAN/NIC-ICT-190/2025_OR_D4D6D601", + "ANDALUCIA/PMC-00528/2025_OR_549D6405", + # 2025-0313 + "GALICIA/52463655/2025_0971E6C6", + # 2025-0507 + "THUERINGEN/31/2025_OR_758A40BD", + "FRANCE/ARA-HCL025053223301/2025_OR_1C544F25", + "ABAY/NRL-2330/2025_C2E364F7", + "RIO_DE_JANEIRO/533/2025_OR_C04A6642", + "LA_REUNION/HCL023041635201/2022_OR_103E7754", + 
"ASTURIAS/232465739/2024_EA8BEFAB", + "UKRAINE/3471/2025_6E494813", + # 2025-0521 + "GALICIA/52475765/2025_F76C5888", + # 2025-0611 no exclusions + # 2025-0618 no exclusions + # 2025-0625 no exclusions + # 2025-0702 + "PHUKET/P1187/2025_5A09E7D5", + ]) .include_name(outgroup) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) .remove_hash_duplicates() .replace_with_master() @@ -155,6 +220,125 @@ def h1(subtype: str, args: argparse.Namespace): "PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", "CATALONIA/NSVH101996495/2023_OR_2E610047", + # 2023-02-04 potential long branch + "SWITZERLAND/69094/2022_OR_B3C15C9B", + # 2023-0421 potential long branches + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + "PAKISTAN/1465/2023_OR_1B7B0A1A", + "BADEN-WURTTEMBERG/159/2022_OR_6AB83E7E", + "PARANA/44706/2021_OR_25BFEB40", + # XXX 2023-0515 + "A/PAKISTAN/GIHSN/ICT/1838/2023_0867EDC7", + "A/PAKISTAN/GIHSN/ICT/1634/2023_OR_3D55578F", + "A/PAKISTAN/GIHSN/ICT/1798/2023_OR_C271B3C9", + "A/PAKISTAN/GIHSN/ICT/1682/2023_OR_FB7A0386", + # XXX 2023-0602 + "PAKISTAN/GIHSN-ICT2035/2023_OR_4E73FFF6", + # XXX 2023-0731 + "CYCLADES/215/2022_D164055B", + #long branch & XXXX 2023-0811 + "GREECE/SRIHER-001/2022_OR_E68E256A", + "SAUDI_ARABIA/40/2020_OR_9B1E74A4", + "SAUDI_ARABIA/39/2020_OR_DAE6A100", + # 2023-0822 XXXX + "PARANA/6405/2023_OR_9ACD4010", + "BAHIA/5785/2023_OR_CBFA332A", + # 2023-0903 very long branch + "NETHERLANDS/10534/2023_OR_6221D5BF", + # 2023-09-08 long branch in tree + "BADAJOZ/18570374/2023_58158069", + # 2023-09-15 XXXX + "ALGIERS/251/2023_OR_1A27768", + # 2023-10-23 XXX & odd start for SAO_PAULO + "SAO_PAULO/CEVI_2300134/2023_OR_BB923F69", + "CAMBODIA/KVH230151/2023_OR_1F665D9C", + "CAMBODIA/AHC230229/2023_OR_3241A3FC", + # 2023-20-28 + "BULGARIA/725/2023_8A6AF67F", # nonsense 1-29 + "CAMBODIA/CCH230213/2023_OR_BE937275", # XXXX + # 2023-12-04 + "BANGKOK/P5113/2023_OR_CB7624FD", # long branch + # 2024-01-01 + "SWITZERLAND/114/2023_MDCK1_A2D22DAA", # long branch + # 2024-01-30 + "CATALONIA/NSAV198289092/2023_OR_291AD7ED" + "PARANA/13897/2023_OR_0A657176", + "BELGIUM/H00082/2023_56A80548", + "CACERES/59816382/2023_A1324011", + "SENEGAL/2098/2023_OR_2EC7BF97", + # 2024-0703 + "GALICIA/GA-CHUAC-98/2024_OR_988888E6", # 29 nuc differences, 19 nuc differences + "MEXICO/2271/2021_338E9C30", # run of differences, 35-38, 40-42 + "ALAGOAS/CEVI_2200198/2021_OR_0811FCC2", # 6 X + "MADRID/1832324191549/2024_OR_988BC424", # 9 X + "HUMAN/KENYA/5866/2023_3F1D8A0B", # 9X + # 2024-0718 + "SAO_PAULO/CEVI_24000816/2024_OR_BC9F903C", # XXXX + "SAO_PAULO/CEVI_24000889/2024_OR_C69399E2", # dels + # 2024-0727 + "LUXEMBOURG/LNS3963248/2024_04C1C479", # XXXX + "COSTA_RICA/INC-0552-822365/2024_OR_9E1DE43D", # XXXX + # 2024-08-06 + "CHANTHABURI/P2877/2024_OR_5B993EF6", # XXXXX + "SRI_LANKA/R4_BC75/2024_50C5FC20" # XXXXX + # 2024-0830 XXXX & many changes + "OHIO/251/2024_OR_30CB2DAF", + "HANOI/PF324163/2024_OR_402B095E", + "SOUTH_AFRICA/PET32788/2024_OR_47322CBB", + # 2024-0902 long branch + "YAOUNDE/24V-3413/2024_8196548B", + # 2024-1004 + "PHUKET/P3423/2024_OR_5E3C7D19", # XXXX + # 2024-1025 + "PARANA/10835/2021_OR_8025A28A", + "TOLEDO/PARANA_10835/2021_OR_8025A28A", + "ENGLAND/3640434/2024_OR_A126DDAB", + "CATALONIA/NSAV198309324/2024_OR_29D1522E", + # 2024-1126 + "OREGON/FLU-OHSU-242090049/2024_05BD35BD", + "OREGON/FLU-OHSU-242090044/2024_DEC6E084", + # 2024-1209 + "NAKHON_PHANOM/F2539/2024_OR_276AD388", + # 2025-0901 + "CAMBODIA/KPH240133/2024_OR_CD9DB776", + 
"ENGLAND/4820116/2024_OR_F5C11575", + # 2025-2401 + "GALICIA/GA-CHUAC-172/2024_OR_000994B0", + "ROMANIA/2920/2024_F73C0E92", + # 2025-1202 + "BALTIMORE/JH-173/2025_A2FCE291", + "CROATIA/HZJZ-8235/2025_170E7DD7", + "ENGLAND/340238/2025_OR_EA36F218", + "ENGLAND/121421/2024_OR_2094499C", + "ANDALUCIA/PMC-00542/2025_OR_BD2FE74E", + "PAKISTAN/NIC-ICT-576/2025_OR_6924EA8E", + "ANDALUCIA/PMC-00549/2025_OR_A6464885", + "CASTELLON/VAHNSI-GIHSN-02_0386/2025_OR_2E26A6D3", + # 2025-0313 + "QUEENSLAND/IN000908/2025_91CEAA08", + "FRANCE/HDF-RELAB-IPP01090/2025_OR_096FF1D6", + "CATALONIA/NSVH171785721/2025_OR_C896185E", + "KAMCHATKA/205-20V/2025_OR_68CABAFF", + # 2025-0507 + "ENGLAND/5100240/2024_OR_EB872B9D", + "CAMBODIA/CCH250029/2025_OR_A4964B9E", + "ASTURIAS/232523061/2025_58877E98", + "PAKISTAN/NIC_SD193/2025_OR_3A27DCA6", + "ESPIRITO_SANTO/475/2025_OR_4FB190FF", + "PAKISTAN/NIC_SD184/2025_OR_66AF2F25", + "PAKISTAN/NIC_JHL115/2025_OR_53A17665", + "GALICIA/22647653/2025_DE4A5E72", + "IRELAND/4321/2025_8EB775D5", + "ENGLAND/640250/2025_OR_01187600", + "PAKISTAN/NIC_NIH749/2025_OR_40914714", + "BELGIUM/S4050/2025_OR_E331B39E", + "GALICIA/GA-CHUAC-314/2025_OR_59908BA2", + "MURCIA/528/2025_OR_E7444F60", + "OREL/SWINE-RII-WD2678S/2025_OR_6C9DA738", + "OREL/SWINE-RII-WD2681S/2025_OR_B004CABD", + "VERMONT/UVM-0478/2022_OR_DFD37A87", + "VERMONT/UVM-1652/2022_OR_42E6DF65", + "FRANCE/NOR-IPP04189/2025_OR_88070E22", # caused tree failure ]) .remove_hash_duplicates() .replace_with_master() @@ -308,6 +492,111 @@ def h3(subtype: str, args: argparse.Namespace): "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", "GALICIA/22034482/2022_2EE89148", "CATALONIA/NSVH560028269/2022_OR_9C189CFA", + # 2023-02-04 potential long branches + "MANITOBA/RV00728/2022_P1_5236154F", + "ONTARIO/198/2022_OR_D1EC3B40", + # 2023-04-21 potential long branches + "ROMANIA/493/2023_612606B9", + "PAKISTAN/1140/2023_OR_3D2D111B", + "CATALONIA/NSVH102045605/2023_OR_7490FFA8", + # XXX & long branch 2023-0731 + "BAHIA/PVM105707/2021_OR_BCE07E46", + "SOUTH_AFRICA/K056301/2023_30E0E639", + # XXXXX 2023-0811 CORRECTED 2023-1023 + "BAHIA/292038965/2021_70230607", + "BAHIA/292104308/2022_D4526D62", + # XXXXX 2023-0829 CORRECTED 2023-1023 + "SOUTH_AFRICA/PET28428/2023_7572E223", + "SOUTH_AFRICA/PET28193/2023_BF46BBB4", + "NORWAY/8084/2023_OR_4548A267", + # 2023-09-08 potential long branch ?why + "SOUTH_AFRICA/K056301/2023_CC1FE952" + # 2023-10-23 XXX + "PARA/CEVI_2200263/2021_OR_6D24213E", + "SAO_PAULO/CEVI_2200855/2022_OR_D4E825BF", + "SAO_PAULO/CEVI_2200871/2022_OR_280FCE18", + # 2023-10-28 + "SOUTH_AFRICA/K056301/2023_CC1FE952", # very divergent + # 2023-12-04 XXXX + "OMAN/CPHL_52323056/2023_OR_297A7E60", + "LUSAKA/1-NIC-332/2023_OR_BA018CE9", + "NORWAY/11072/2023_OR_EB1EA580", + # 2024-01-01 + "SAINT_PETERSBURG/RII-MH151473S/2023_OR_D20D279D", # XX + "SAINT_PETERSBURG/RII-4937S/2023_OR_95887A8C", # XX + "CANADA/NOVASCOTIA-32036/2022_A90EF453", # first 7 aa + "BRITISH_COLUMBIA/PHL-602/2022_OR_A1B8E845", # XX + # 2024-01-22 for 2024-01-16 file + "INDONESIA/BIOKES-IBKL1127/2023_OR_791937E7", # long branch + "SAO_PAULO/CEVI_2201055/2022_OR_BF9D3752", # XX + "ROMANIA/3227/2023_E20D207E", #XX + "SAO_PAULO/CEVI_2200903/2022_OR_BB30A339", # XX + "CACERES/50644480/2023_361FA786", # long branch + "BELGIUM/S1112/2023_OR_D7ABA060", # long branch + "BELGIUM/S0576/2023_OR_CEC5B896", # long branch + "BELGIUM/S0220/2023_OR_0B549607", # long branch + "BELGIUM/H0592/2022_OR_22BD830C", # long branch + # 2024-01-30 + "SPAIN/GA-HULA-615019/2023_B7077195", + 
"BELGIUM/S03941/2023_F4250630", + "NEW_YORK/392/2004_CFB2F75C", + "SPAIN/GA-HULA-615037/2023_E8183B9E", + "CACERES/59816379/2023_4D9F91F8", + "TURKEY/GIHSN_ANKARA-140111_950/2023_E0D8B88D", + # 2024-02-11 XXXX Indonesia is long branch + "KENYA/AFI-KDH-1820/2023_B3509344", + "INDONESIA/BIOKES-IDSW0063/2023_OR_5962326E", + "KENYA/AFI-ISB-1107/2023_E5F19B22", + "KENYA/AFI-MNB-1100/2023_6D42271", + # 2024-0703 + "CHIANG_RAI/P2287/2024_OR_2744AA9C", # 3 deletions + # 2024-0727 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # 198 diffs + "WASHINGTON/S25532/2024_OR_13A107A6", # XXXX + "OREGON/S25529/2024_OR_5502D106", # XXXX + "OREGON/S25537/2023_OR_BE6502FF", # XXXX + "CAMBODIA/CCH230307/2023_7CC43803", # XXXX + "CAMBODIA/IAHC230253/2023_50A155E0", # XXXX + # 2024-08-06 + "VIRGINIA/H20-355-5372/2024_OR_53F1E92E", # XXXXX + "PANAMA/M273491/2024_OR_27E79A9B" # XXXXX + # 2024-0830 many changes + "COLORADO/150/2024_OR_2BD9D59A", + "COLORADO/150/2024_SIAT1_14B3FB98", + "MICHIGAN/113/2024_SIAT1_B0E6BEF3", + # 2024-0902 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # long branch + "PANAMA/M273491/2024_OR_27E79A9B", # XXXX + # 2024-1004 + "CAMBODIA/NPH240499/2024_OR_F38419C8", #XXXX + # 2024-1025 + "CAMBODIA/NPH240575/2024_OR_33D3499A", + # 2024-1115 + "PARANA/44706/2021_OR_25BFEB40", + "SANTA_HELENA/PARANA_44706/2021_OR_25BFEB40", + # 2024-0111 (tree built 2024-1811) + "GREATER_ACCRA_REGION/FS-24-4832/2024_B7004939", + "GREATER_ACCRA_REGION/FS-24-4779/2024_1393BD1C", + # 2024-2611 + "QUEENSLAND/IN000643/2024_1ACA7E5A", + "QUEENSLAND/IN000245/2024_0854A342", + "WASHINGTON/S26082/2022_OR_7462BFDE", + "WASHINGTON/S26079/2022_OR_07B0CEEC", + "WASHINGTON/S26107/2023_OR_A56901B3", + "WASHINGTON/S26104/2022_OR_E80CAD50", + "WASHINGTON/S26097/2022_OR_40AE3B8C", + "WASHINGTON/S26077/2022_OR_C294BE90", + "WASHINGTON/S26098/2022_OR_1EDB1957", + "WASHINGTON/S26088/2022_OR_C6B3FAA9", + # 2024-0912 + "GALICIA/28220935/2023_C495745E", + "GALICIA/33341960/2023_0FF7D1C8", + # 2025-2401 + "TURKMENISTAN/784/2025_OR_F898A707", + # 2025-1202 + "COTE_DIVOIRE/4331/2024_OR_E0FF092D", + # 2025-1202 + "WISCONSIN/67/2005_XXMDCK2_8C52B46C", ]) .remove_hash_duplicates() .replace_with_master() diff --git a/proj/weekly-tree/export~ b/proj/weekly-tree/export~ new file mode 100755 index 00000000..b5cbf12a --- /dev/null +++ b/proj/weekly-tree/export~ @@ -0,0 +1,437 @@ + +#! 
/usr/bin/env python3 + +import sys, os, argparse, traceback +from pathlib import Path +sys.path[:0] = [str(Path(os.environ["AE_ROOT"], dir)) for dir in ["build", "py"]] +import ae_backend +from ae.sequences.source import fasta + +# ====================================================================== + +sSubtypeForSeqdb = {"B": "B", "BVIC": "B", "BYAM": "B", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +def main(args: argparse.Namespace): + subtypes = [sSubtypeNorm[st] for st in (args.subtype or ["BVIC", "BYAM", "H1", "H3"])] + for subtype in subtypes: + sSubtypeToFunc[subtype](subtype=subtype, args=args) + +# ---------------------------------------------------------------------- + +def bvic(subtype: str, args: argparse.Namespace): + outgroup = "VICTORIA/830/2013" + selected = ( + ae_backend.seqdb.for_subtype(sSubtypeForSeqdb[subtype]) + .select_all() + .human() + .lineage("VICTORIA") + .exclude_with_issue() + .filter_dates(first="2016-06") + .find_masters() # to be able to use .filter below + .exclude(lambda ref: ref.aa["165-"]) # exclude 6-del and MIE/1/2019 with garbage in the middle + .exclude_name([ + "ANHUI_JINGHU/63/2019", # garbage in 406-414 leading to a long branch (2021-0722) + "LISBOA/NISU182_17-18/2018", # yamagata, long branch 2021-1117 + # "ZAMBIA/503/2021", # potential long branch 2022-0401 + # 2023-01-28 potential long branch + "KRABI/FS002/2022", + "CATALONIA/NSVH151199301/2023_OR_0ACCE47F", # XXXX 2023-0421 + "SOUTH_AFRICA/R10785/2022_OR_0EB8BBEA", # XXX 2023-0515 + "SOUTH_AFRICA/R10907/2022_OR_BA5B358C", + "SOUTH_AFRICA/R09298/2022_OR_1A5C630A", + "SOUTH_AFRICA/R10914/2022_OR_ADA6D682", + "SOUTH_AFRICA/R10638/2022_OR_5EAB4CBB", + "SOUTH_AFRICA/R11310/2022_OR_12125C84", + "SOUTH_AFRICA/R11575/2022_OR_FED84F4F", + "SOUTH_AFRICA/R08092/2022_OR_93DDF2B2", + # XXX 2023-0602 + "LA_REUNION/HCL023041635201/2022_103E7754", + "ROMANIA/550950/2023_MDCK-SIAT_E0DECCD2", + "SAINT_PETERSBURG/RII-10866S/2023_OR_477275F7", + # XXX2023-0629 + "ABU_DHABI/4030/2022_AEC2DE48", + "RIO_GRANDE_DO_SUL/28773/2022_455F4FFF", + # 2023-0731 + "MALAYSIA/IMR-SARI0563/2023_25FE3E0E", + #2023-0811 Yam + "KENYA/KCH157321/2019_OR_308AC34A", + # 2023-0822 XXX + "ALAGOAS/5742/2023_OR_441065AD", + "ALAGOAS/4659/2023_OR_558AC3D6", + "PARANA/5012/2023_OR_D54659DC", + # 2023-1005 XXXX + "UNITED_KINGDOM/UO-476/2023", + ]) + .include_name(outgroup) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) + .remove_hash_duplicates() + .replace_with_master() + .check() + .sort("+date") + .move_name_to_beginning(outgroup) + ) + report(subtype, selected, expected_nuc_size=1710, limit=0) + export(subtype, selected) + +# ---------------------------------------------------------------------- + +def byam(subtype: str, args: argparse.Namespace): + outgroup = "MINNESOTA/2/2014" + selected = ( + ae_backend.seqdb.for_subtype(sSubtypeForSeqdb[subtype]) + .select_all() + .human() + .lineage("YAMAGATA") + .exclude_with_issue() + .filter_dates(first="2015") + .find_masters() # to be able to use .filter below + .include_name([outgroup]) # , "IDAHO/1/2014", "MASSACHUSETTS/7/2014"]) + .exclude_name([lin for lin in (line.strip() for line in Path("byam-exclude.txt").open()) if lin and 
lin[0] != "#"]) + .remove_hash_duplicates() + .replace_with_master() + .sort("+date") + .move_name_to_beginning(outgroup) + ) + report(subtype, selected, expected_nuc_size=1710, limit=0) + export(subtype, selected) + +# ---------------------------------------------------------------------- + +def h1(subtype: str, args: argparse.Namespace): + h1_include_txt = Path("h1-include.txt") + if not h1_include_txt.exists(): + h1_include_txt = Path(sys.argv[0]).parent.joinpath(h1_include_txt.name) + if not h1_include_txt.exists(): + raise RuntimeError("> cannot find {h1_include_txt}") + to_include = [lin for lin in (line.strip() for line in h1_include_txt.open()) if lin and lin[0] != "#"] + # to_include = to_include[:-500] + outgroup = "SOUTH_AUSTRALIA/30/2013" + selected = ( + ae_backend.seqdb.for_subtype(sSubtypeForSeqdb[subtype]) + .select_all() + .human() + .exclude_with_issue() + .filter_dates(first="2019") + .find_masters() # to be able to use .filter below + .include_name([ + outgroup, + "MICHIGAN/45/2015", + "BRAZIL/5026/2017", # # 183S, trying to show "the different 183P clades has S183P independently", inferred from 2018-0924-ssm tree + ] + to_include) # ~/AD/share/conf/h1.183S.2017-2018.seqids + .exclude_name([ + "SWINE/", + # 2020-08-10 Sarah: They're not sequenced by a CC or NIC, but by a company. The sequences don't have obvious errors, but are very different (obviously) from the rest. My feeling is they're either variant viruses (from swine) or a sequencing error. + "ANKARA/14015-724/2019", + "ANKARA/14017-004/2019", + "ANKARA/14015-736/2019", + # 2021-06-22 deletion at the end + # "BADEN-WURTTEMBERG/1/2020", + # long branch 2021-0622 + "DENMARK/1/2021", + "MANITOBA/2/2021", + # 2021-1117 long branches + "MECKLENBURG-VORPOMMERN/1/2021", + "MECKLENBURG-VORPOMMERN/1-A/2021", # refers to MECKLENBURG-VORPOMMERN/1/2021 + "NETHERLANDS/10370-1/2020", + "NETHERLANDS/10370-1A/2020", + "NETHERLANDS/10370-2/2020", # refers to NETHERLANDS/10370-1A/2020 + "NETHERLANDS/GENT-193/2019", + "HESSEN/47/2020", + "HUBEI WUJIAGANG/1324/2020", + "HEBEI HAIGANG/SWL1572/2019", + "SHANDONG/204/2021", + "YUNNAN MENGZI/1462/2020", + "NORTH CAROLINA/15/2020", + "WISCONSIN/3/2021", + "PARANA/10835/2021", + # 2021-1125 long branches + "GANSU_XIFENG/1194/2021", + "GANSU_XIFENG/1143/2021", + "SICHUAN/1208/2021", + "TIANJIN/30/2020", + "WISCONSIN/5/2021", + "WISCONSIN/4/2021", + "NORTH_DAKOTA/12226/2021", + # 2021-1217 potential long branch + "IOWA/1/2021", # --> SWINE/MISSOURI/A01104146/2020 (base), SWINE/MISSOURI/A02525160/2021 + "IOWA/6/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + "NORTH DAKOTA/12226/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + # 2021-1217 long branches + "NETHERLANDS/10370-1A/2020", + # 2022-0131 potential long branch + "DENMARK/36/2021", + # 2022-04-26 mess in 156-179 + "INDIA/NIV-NICOBAR91/2020_OR_01A87F80", + "INDIA/NIV-NICOBAR163/2020_OR_823B2A0F", + "INDIA/NIV-NICOBAR106/2020_OR_B81F1DC2", + "INDIA/NIV-NICOBAR164/2020_OR_37A11133", + # 2022-10-07 potential long branch + "PARANA/20675/2022_OR_3D8D671C", + # 2022-12-10 potential long branch + "DAKAR/3124/2021_279E04A6", + # 2022-12-23 potential long branch + "NAVARRA/4050/2022_OR_BBC31B3A", + # 2023-01-13 potential long branch + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + # 2023-01-28 potential long branches + "NAVARRA/4050/2022_OR_BBC31B3A", + "DAKAR/3124/2021_279E04A6 pdm09-like", + "PAKISTAN/GIHSN-GB-445/2022_OR_A20D55B6", + "PAKISTAN/GIHSN-ICT-152/2023_OR_572EBEB8", + 
"PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + "CATALONIA/NSVH101996495/2023_OR_2E610047", + # 2023-02-04 potential long branch + "SWITZERLAND/69094/2022_OR_B3C15C9B", + # 2023-0421 potential long branches + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + "PAKISTAN/1465/2023_OR_1B7B0A1A", + "BADEN-WURTTEMBERG/159/2022_OR_6AB83E7E", + "PARANA/44706/2021_OR_25BFEB40", + # XXX 2023-0515 + "A/PAKISTAN/GIHSN/ICT/1838/2023_0867EDC7", + "A/PAKISTAN/GIHSN/ICT/1634/2023_OR_3D55578F", + "A/PAKISTAN/GIHSN/ICT/1798/2023_OR_C271B3C9", + "A/PAKISTAN/GIHSN/ICT/1682/2023_OR_FB7A0386", + # XXX 2023-0602 + "PAKISTAN/GIHSN-ICT2035/2023_OR_4E73FFF6", + # XXX 2023-0731 + "CYCLADES/215/2022_D164055B", + #long branch & XXXX 2023-0811 + "GREECE/SRIHER-001/2022_OR_E68E256A", + "SAUDI_ARABIA/40/2020_OR_9B1E74A4", + "SAUDI_ARABIA/39/2020_OR_DAE6A100", + # 2023-0822 XXXX + "PARANA/6405/2023_OR_9ACD4010", + "BAHIA/5785/2023_OR_CBFA332A", + # 2023-0903 very long branch + "NETHERLANDS/10534/2023_OR_6221D5BF", + # 2023-09-08 long branch in tree + "BADAJOZ/18570374/2023_58158069", + # 2023-09-15 XXXX + "ALGIERS/251/2023_OR_1A27768", + ]) + .remove_hash_duplicates() + .replace_with_master() + .sort("+date") + .move_name_to_beginning(outgroup) + ) + report(subtype, selected, expected_nuc_size=1647, limit=0) # aa 549 + export(subtype, selected) + +# ---------------------------------------------------------------------- + +def h3(subtype: str, args: argparse.Namespace): + outgroup = "PERTH/16/2009" + selected = ( + ae_backend.seqdb.for_subtype(sSubtypeForSeqdb[subtype]) + .select_all() + .human() + .exclude_with_issue() + .filter_dates(first="2018-03") + .find_masters() # to be able to use .filter below + .include_name([ + outgroup, + "VICTORIA/361/2011", # 2011-10-24 3C.3 + # [2020-09-07] Nearest common ancestor of 2a and 3a in /syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ + "MARYLAND/30/2012", # 2012-07-26 3C.3 + # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough + "TEXAS/50/2012", # 2012-04-15 3C.3 root in MELB tree 2020 + "VERMONT/6/2012", + "STOCKHOLM/6/2014", # 2014-02-06 3a + # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top + "SWITZERLAND/9715293/2013", # 2013-12-06 3a + "NORWAY/466/2014", # 2014-02-03 3a + "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a + "TASMANIA/11/2014", # 2014-03-16 3a + "KOBE/63/2014", # 2014-05-21 3a + "PERU/27/2015", # 2015-04-13 3a + "NEVADA/22/2016", # 2016-03-05 3a + "IDAHO/33/2016", # 2016-06-08 3a + "TEXAS/88/2016", # 2016-02-25 3a + "TEXAS/71/2017", # 2017-03-18 3a + "BRAZIL/7331/2018", # 2018-07-09 3a + "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page + "HONG_KONG/4801/2014", # 2014-02-26 2a + "HAWAII/47/2014", # 2014-07-18 2a + "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 + "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 + "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 + "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K + "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K + "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K + "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K + + # Strains before and after T135N substitution to have a better 135N branch placement + # Sarah 2021-02-08 17:05 + "WISCONSIN/85/2016", + "SRI_LANKA/56/2017", + "SOUTH_CAROLINA/4/2017", + "YOKOHAMA/145/2017", + "INDIA/9930/2017", + "HONG_KONG/3118/2017", + 
"HAWAII/47/2014", + "NIIGATA-C/43/2015", + "DAKAR/17/2016", + # "CAMEROON/16V-9267/2016", # excluded, truncated sequence + "LAOS/3008/2016", + "YUNNAN_LINXIANG/1718/2016", + "HONG_KONG/2302/2016", + "ONTARIO/RV2414/2015", + "ONTARIO/RV2414/2015", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + # XXXXX 2023-0811 + "BAHIA/292038965/2021_70230607", + "BAHIA/292104308/2022_D4526D62", + # XXXXX 2023-0829 + "SOUTH_AFRICA/PET28428/2023_7572E223", + "SOUTH_AFRICA/PET28193/2023_BF46BBB4", + "NORWAY/8084/2023_OR_4548A267", + ]) + .exclude_name([ + "SWINE/", + # long branch (2021-0622) + "WISCONSIN/1/2021", + # long branch (2021-1117) + "HAWAII/28/2020", + "SOUTH_AUSTRALIA/85/2018", + "SOUTH_AUSTRALIA/1/2021", + "MANITOBA/3/2021", + "GUANGXI_GANGBEI/190/2019", + # 2021-1217 potential long branch + "OHIO/6/2021_ISOLATE", + # 2022-0204 potential long branch + "FRANCE/ARA-HCL021219268801/2021", + "FRANCE/ARA-HCL021220304101/2021", + # 2022-0209 long branch, eight X, R in nuc + "ONTARIO/RV2002/2019_P1_0F1E8F1E", + # 2022-0415 potential long branch + "ACRE/181823-IEC/2021", + # 2022-0819 potential long branch + "CATALONIA/NSVH101883267/2022_OR_F79C7813", + # 2022-0825 long branch in the tree + "INDIANA/27/2018_SIAT1_26C3A2A4", + "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", + "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", + # 2022-0826 potential long branch + "FIJI/99/2022_OR_80B95BF8", + "FIJI/102/2022_OR_6AEB79D8", + "WEST_VIRGINIA/25/2022_SIAT1_3E592413", + "WEST_VIRGINIA/26/2022_SIAT1_3E592413", + "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", + # 2022-0902 potential long branch + "FRANCE/ARA-HCL022060609601/2022_65F4520B", + # 2022-10-07 potential long branch + "CATALONIA/NSVH101908168/2022_OR_D5C0176E", + # 2022-10-28 potential long branch + "MICHIGAN/48/2022_OR_910196F9", + # 2022-11-04 potential long branch + "NEW_MEXICO/19/2022_IRX_4BA11056", + # 2022-11-11 potential long branches + "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", + "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", + "ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", + # 2022-11-26 potential long branches + "CATALONIA/NSVH101954411/2022_OR_7A633967", + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + # 2022-12-02 potential long branches + "MISSOURI/19/2022_OR_837C934E", + # 2022-12-10 potential long branches + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 159X 160X 162X 163X 164X + "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X + "MICHIGAN/48/2022_OR_910196F9", + "MICHIGAN/UOM10047284024/2022_OR_910196F9", + # 2023-01-08 potential long branches + "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", + # 2023-01-13 potential long branch + "GALICIA/22034482/2022_2EE89148", + "GALICIA/22018524/2022_1EE46082", + "GALICIA/22033804/2022_0EE699D6", + "GALICIA/21987208/2022_F49E5748", + "GALICIA/22036079/2022_430E996A", + "GALICIA/22018743/2022_0116B017", + "GALICIA/22011636/2022_3D9F40AD", + # 2023-01-20 potential long branch + "SINGAPORE/GP9117/2022_OR_69761A51", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + # 2023-01-28 potential long branches + "MICHIGAN/48/2022_OR_910196F9", + "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", + "KHON_KAEN/FS005/2022_OR_1A5637D8", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + "ATHENS/355/2022_D9D4A86", + "CATALONIA/NSVH101994299/2023_OR_13A43DA9", + "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", + "SOUTH_AFRICA/R07603/2022_OR_7937FE46", + "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", + "GALICIA/22034482/2022_2EE89148", + 
"CATALONIA/NSVH560028269/2022_OR_9C189CFA", + # 2023-02-04 potential long branches + "MANITOBA/RV00728/2022_P1_5236154F", + "ONTARIO/198/2022_OR_D1EC3B40", + # 2023-04-21 potential long branches + "ROMANIA/493/2023_612606B9", + "PAKISTAN/1140/2023_OR_3D2D111B", + "CATALONIA/NSVH102045605/2023_OR_7490FFA8", + # XXX & long branch 2023-0731 + "BAHIA/PVM105707/2021_OR_BCE07E46", + "SOUTH_AFRICA/K056301/2023_30E0E639", + # 2023-09-08 potential long branch ?why + "SOUTH_AFRICA/K056301/2023_CC1FE952" + ]) + .remove_hash_duplicates() + .replace_with_master() + .sort("+date") + .move_name_to_beginning(outgroup) + ) + report(subtype, selected, expected_nuc_size=1650, limit=0) + export(subtype, selected) + +# ---------------------------------------------------------------------- + +sSubtypeToFunc = {"BVIC": bvic, "BYAM": byam, "A(H1N1)": h1, "A(H3N2)": h3} + +# ---------------------------------------------------------------------- + +def export(subtype: str, selected :ae_backend.seqdb.Selected): + fasta.write(filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), selected=selected, aa=False) + +# ---------------------------------------------------------------------- + +def subtype_dir(subtype: str): + dir = Path(sSubdirforSubtype[subtype]) + dir.mkdir(exist_ok=True) + return dir + +# ---------------------------------------------------------------------- + +def report(subtype: str, selected :ae_backend.seqdb.Selected, expected_nuc_size: int, limit: int = 20): + print(f"{subtype:7s} {len(selected)}") + for seq in selected: + if len(seq.nuc) != expected_nuc_size: + print(f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) ") + for no, seq in enumerate(selected): + if no >= limit: + break + # print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}] {seq.aa}") + print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}]") + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subtype", nargs="*", type=lambda src: src.upper()) + parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False) + args = parser.parse_args() + exit_code = main(args) or 0 +except Exception as err: + print(f"> {err}\n{traceback.format_exc()}", file=sys.stderr) + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/garli-h3 b/proj/weekly-tree/garli-h3 new file mode 100755 index 00000000..49c0e631 --- /dev/null +++ b/proj/weekly-tree/garli-h3 @@ -0,0 +1,452 @@ +#! 
/usr/bin/env python3 + +import sys, os, re, argparse, logging, time, datetime, concurrent.futures, subprocess, random, traceback +from pathlib import Path + +# ====================================================================== + +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +sRandomGen = random.SystemRandom() + +# ---------------------------------------------------------------------- + +def main(args: argparse.Namespace): + argv = '"' + '" "'.join(sys.argv) + '"' + working_dir = Path(args.subdir).resolve() + if args.start_tree: + tree = args.start_tree.resolve() + else: + tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve() + if args.source_fasta: + source_fasta = args.source_fasta.resolve() + else: + source_fasta = working_dir.joinpath("source.fas").resolve() + try: + best_tree_file = garli(working_dir=working_dir, tree=tree, source_fas=source_fasta, num_runs=args.num_runs, stoptime=args.stoptime, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes, new_output=args.new_output) + subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True) + return 0 + except Exception as err: + subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True) + raise + +# ---------------------------------------------------------------------- + +def garli(working_dir: Path, tree: Path, source_fas: Path, num_runs: int, stoptime: int, slurm_nodelist: str, slurm_exclude_nodes: str, new_output: bool): + + default_required_memory = 15000 + + # the number of attachment branches evaluated for each taxon to be added to the tree during the creation of an ML stepwise-addition starting tree (when garli is run without raxml step) + attachmentspertaxon = 1000000 + # termination condition: when no new significantly better scoring topology has been encountered in greater than this number of generations, garli stops + genthreshfortopoterm = 20000 + + output_dir = get_output_dir(working_dir, "garli.", new_output=new_output) + + garli_cmd = "/syn/bin/garli" + srun_cmd = [ + "srun", + "--cpus-per-task=2", + # "--nodes=1", + "--ntasks=1", + "--threads=1", + f"--nodelist={slurm_nodelist}", + ] + if slurm_exclude_nodes: + srun_cmd.append(f"--exclude={slurm_exclude_nodes}") + + # ---------------------------------------------------------------------- + + def make_conf(run_id, required_memory): + """Returns filename of the written conf file""" + # if not outgroup or not isinstance(outgroup, list) or not all(isinstance(e, int) and e > 0 for e in outgroup): + # raise ValueError("outgroup must be non-empty list of taxa indices in the fasta file starting with 1") + garli_args = { + "source": str(source_fas.resolve()), + "availablememory": required_memory or default_required_memory, + "streefname": str(tree.resolve()), + "output_prefix": str(output_dir.joinpath(run_id)), + "attachmentspertaxon": attachmentspertaxon, + "randseed": sRandomGen.randint(1, 0xFFFFFFF), + "genthreshfortopoterm": genthreshfortopoterm, + "searchreps": 1, + "stoptime": stoptime, + "outgroup": "1" # " ".join(str(e) for e 
in outgroup), + } + conf = GARLI_CONF.format(**garli_args) + conf_filename = output_dir.joinpath(run_id + ".garli.conf") + with conf_filename.open("w") as f: + f.write(conf) + return conf_filename + + # ---------------------------------------------------------------------- + + def find_memory_requirements(): + return default_required_memory + +# sGreat = re.compile(r"\s*great\s+>=\s+(\d+)\s+MB", re.I) +# conf_file = make_conf(run_id="find_memory_requirements", required_memory=None) +# start = time.time() +# proc = subprocess.Popen([garli_cmd, str(conf_file)], stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +# required_memory = None # default, in case the code below fails or times out +# timeout = 300 +# while required_memory is None: +# line = proc.stdout.readline().decode("utf-8") +# if mm := sGreat.match(line): +# required_memory = int(mm.group(1)) +# elif (time.time() - start) > timeout: +# required_memory = default_required_memory +# print(f"WARNING: Cannot obtain memory required by garli in {timeout} seconds: default for {working_dir.name}: {required_memory} will be used", file=sys.stderr) +# proc.kill() +# return required_memory + + # ---------------------------------------------------------------------- + + def run_garli(run_id, required_memory): + conf_file = make_conf(run_id=run_id, required_memory=required_memory) + # --mem-per-cpu is half of required_memory because we allocate whole 2 cpus per run to prevent from hyperthreading + cmd = (srun_cmd + [f"--mem-per-cpu={int(required_memory / 2) + 1}", f"--job-name='garli {working_dir.name} {run_id}'", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"] + + [garli_cmd, str(conf_file)]) + with output_dir.joinpath("commands.txt").open("a") as commands_txt: + print(" ".join(cmd), file=commands_txt) + # print(cmd) + subprocess.check_call(cmd) + + # ---------------------------------------------------------------------- + + def submit(): + required_memory = find_memory_requirements() + failures = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + future_to_runid = {executor.submit(run_garli, run_id, required_memory): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))} + for future in concurrent.futures.as_completed(future_to_runid): + try: + future.result() + except Exception as err: + logging.error(f"garli {working_dir.name} {future_to_runid[future]} FAILED: {err}") + failures += 1 + else: + logging.info(f"garli run {working_dir.name} {future_to_runid[future]} completed") + if failures: + logging.error(f"garli {working_dir.name} FAILED: {failures} runs failed") + else: + logging.info("garli completed") + + # ---------------------------------------------------------------------- + + def results_available(): + return len(list(output_dir.glob("*.best.phy"))) > (num_runs / 2) + + # ---------------------------------------------------------------------- + + sReFinalScore = re.compile(r"Final\t-(\d+\.\d+)") + + def find_best_result(): + # scores in results are positive numbers + results = [[log_file_name, float(sReFinalScore.search(log_file_name.open().read()).group(1))] for log_file_name in output_dir.glob("*.log00.log")] + if not results: + raise RuntimeError(f"no results found in {output_dir}/*.log00.log") + results.sort(key=lambda en: en[1]) + output_dir.parent.joinpath("garli-best.txt").open("w").write(str(results[0][0]).replace(".log00.log", ".best.phy")) + 
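+        # garli-all.txt lists every run's (positive) final score next to the path of
+        # its .best.phy tree, best-scoring run first, so individual runs can be compared by hand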
output_dir.parent.joinpath("garli-all.txt").open("w").write("\n".join(f"{score:13.6f} {str(log_name).replace('.log00.log', '.best.phy')}" for log_name, score in results)) + return [Path(str(results[0][0]).replace(".log00.log", ".best.phy")), results[0][1]] + + # ---------------------------------------------------------------------- + + if not results_available(): + start = datetime.datetime.now() + submit() + logging.info(f"[{working_dir}]: garli elapsed: {datetime.datetime.now() - start}") + best_tree_file, best_score = find_best_result() + logging.info(f"[{working_dir}]: garli best: {best_tree_file} score: {best_score}") + return best_tree_file + +# ---------------------------------------------------------------------- + +GARLI_CONF = """\ +[general] +datafname = {source} +streefname = {streefname} +# Prefix of all output filenames, such as log, treelog, etc. Change +# this for each run that you do or the program will overwrite previous +# results. +ofprefix = {output_prefix} + +constraintfile = none + +# (1 to infinity) The number of attachment branches evaluated for each +# taxon to be added to the tree during the creation of an ML +# stepwise-addition starting tree. Briefly, stepwise addition is an +# algorithm used to make a tree, and involves adding taxa in a random +# order to a growing tree. For each taxon to be added, a number of +# randomly chosen attachment branches are tried and scored, and then +# the best scoring one is chosen as the location of that taxon. The +# attachmentspertaxon setting controls how many attachment points are +# evaluated for each taxon to be added. A value of one is equivalent +# to a completely random tree (only one randomly chosen location is +# evaluated). A value of greater than 2 times the number of taxa in +# the dataset means that all attachment points will be evaluated for +# each taxon, and will result in very good starting trees (but may +# take a while on large datasets). Even fairly small values (< 10) can +# result in starting trees that are much, much better than random, but +# still fairly different from one another. Default: 50. +attachmentspertaxon = {attachmentspertaxon} + +# -1 - random seed chosen automatically +randseed = {randseed} + +# in Mb +availablememory = {availablememory} + +# The frequency with which the best score is written to the log file, default: 10 +logevery = 1000 + +# Whether to write three files to disk containing all information +# about the current state of the population every saveevery +# generations, with each successive checkpoint overwriting the +# previous one. These files can be used to restart a run at the last +# written checkpoint by setting the restart configuration entry. Default: 0 +writecheckpoints = 0 + +# Whether to restart at a previously saved checkpoint. To use this +# option the writecheckpoints option must have been used during a +# previous run. The program will look for checkpoint files that are +# named based on the ofprefix of the previous run. If you intend to +# restart a run, NOTHING should be changed in the config file except +# setting restart to 1. A run that is restarted from checkpoint will +# give exactly the same results it would have if the run had gone to +# completion. Default: 0 +restart = 0 + +# If writecheckpoints or outputcurrentbesttopology are specified, this +# is the frequency (in generations) at which checkpoints or the +# current best tree are written to file. 
default: 100 +saveevery = 1000 + +# Specifies whether some initial rough optimization is performed on +# the starting branch lengths and alpha parameter. This is always +# recommended. Default: 1. +refinestart = 1 + +# If true, the current best tree of the current search replicate is +# written to .best.current.tre every saveevery +# generations. In versions before 0.96 the current best topology was +# always written to file, but that is no longer the case. Seeing the +# current best tree has no real use apart from satisfying your +# curiosity about how a run is going. Default: 0. +outputcurrentbesttopology = 0 + +# If true, each new topology encountered with a better score than the +# previous best is written to file. In some cases this can result in +# really big files (hundreds of MB) though, especially for random +# starting topologies on large datasets. Note that this file is +# interesting to get an idea of how the topology changed as the +# searches progressed, but the collection of trees should NOT be +# interpreted in any meaningful way. This option is not available +# while bootstrapping. Default: 0. +outputeachbettertopology = 0 + +# Specifies whether the automatic termination conditions will be +# used. The conditions specified by both of the following two +# parameters must be met. See the following two parameters for their +# definitions. If this is false, the run will continue until it +# reaches the time (stoptime) or generation (stopgen) limit. It is +# highly recommended that this option be used! Default: 1. +enforcetermconditions = 1 + +# This specifies the first part of the termination condition. When no +# new significantly better scoring topology (see significanttopochange +# below) has been encountered in greater than this number of +# generations, this condition is met. Increasing this parameter may +# improve the lnL scores obtained (especially on large datasets), but +# will also increase runtimes. Default: 20000. +genthreshfortopoterm = {genthreshfortopoterm} + +# The second part of the termination condition. When the total +# improvement in score over the last intervallength x intervalstostore +# generations (default is 500 generations, see below) is less than +# this value, this condition is met. This does not usually need to be +# changed. Default: 0.05 +scorethreshforterm = 0.05 + +# The lnL increase required for a new topology to be considered +# significant as far as the termination condition is concerned. It +# probably doesn’t need to be played with, but you might try +# increasing it slightly if your runs reach a stable score and then +# take a very long time to terminate due to very minor changes in +# topology. Default: 0.01 +significanttopochange = 0.01 + +# Whether a phylip formatted tree files will be output in addition to +# the default nexus files for the best tree across all replicates +# (.best.phy), the best tree for each replicate +# (.best.all.phy) or in the case of bootstrapping, the best +# tree for each bootstrap replicate (). +# We use .phy (it's newick tree format), use 1 here! +outputphyliptree = 1 + +# Whether to output three files of little general interest: the +# “fate”, “problog” and “swaplog” files. The fate file shows the +# parentage, mutation types and scores of every individual in the +# population during the entire search. The problog shows how the +# proportions of the different mutation types changed over the course +# of the run. 
The swaplog shows the number of unique swaps and the +# number of total swaps on the current best tree over the course of +# the run. Default: 0 +outputmostlyuselessfiles = 0 + +# This option allow for orienting the tree topologies in a consistent +# way when they are written to file. Note that this has NO effect +# whatsoever on the actual inference and the specified outgroup is NOT +# constrained to be present in the inferred trees. If multiple +# outgroup taxa are specified and they do not form a monophyletic +# group, this setting will be ignored. If you specify a single +# outgroup taxon it will always be present, and the tree will always +# be consistently oriented. To specify an outgroup consisting of taxa +# 1, 3 and 5 the format is this: outgroup = 1 3 5 +outgroup = {outgroup} + +resampleproportion = 1.0 +inferinternalstateprobs = 0 +outputsitelikelihoods = 0 +optimizeinputonly = 0 +collapsebranches = 1 + +# The number of independent search replicates to perform during a +# program execution. You should always either do multiple search +# replicates or multiple program executions with any dataset to get a +# feel for whether you are getting consistent results, which suggests +# that the program is doing a decent job of searching. Note that if +# this is > 1 and you are performing a bootstrap analysis, this is the +# number of search replicates to be done per bootstrap replicate. That +# can increase the chance of finding the best tree per bootstrap +# replicate, but will also increase bootstrap runtimes +# enormously. Default: 2 +searchreps = {searchreps} + +bootstrapreps = 0 + +# ------------------- FOR NUCLEOTIDES -------------- +datatype=nucleotide + +# The number of relative substitution rate parameters (note that the +# number of free parameters is this value minus one). Equivalent to +# the “nst” setting in PAUP* and MrBayes. 1rate assumes that +# substitutions between all pairs of nucleotides occur at the same +# rate, 2rate allows different rates for transitions and +# transversions, and 6rate allows a different rate between each +# nucleotide pair. These rates are estimated unless the fixed option +# is chosen. New in version 0.96, parameters for any submodel of the +# GTR model may now be estimated. The format for specifying this is +# very similar to that used in the “rclass’ setting of PAUP*. Within +# parentheses, six letters are specified, with spaces between +# them. The six letters represent the rates of substitution between +# the six pairs of nucleotides, with the order being A-C, A-G, A-T, +# C-G, C-T and G-T. Letters within the parentheses that are the same +# mean that a single parameter is shared by multiple nucleotide +# pairs. For example, ratematrix = (a b a a b a) would specify the HKY +# 2-rate model (equivalent to ratematrix = 2rate). This entry, +# ratematrix = (a b c c b a) would specify 3 estimated rates of +# subtitution, with one rate shared by A-C and G-T substitutions, +# another rate shared by A-G and C-T substitutions, and the final rate +# shared by A-T and C-G substitutions. Default: 6rate +ratematrix = 6rate + +# (equal, empirical, estimate, fixed) Specifies how the equilibrium +# state frequencies (A, C, G and T) are treated. The empirical setting +# fixes the frequencies at their observed proportions, and the other +# options should be self-explanatory. Default: estimate +statefrequencies = estimate + +# (none, gamma, gammafixed) – The model of rate heterogeneity +# assumed. 
“gammafixed” requires that the alpha shape parameter is +# provided, and a setting of “gamma” estimates it. Default: gamma +ratehetmodel = gamma + +# (1 to 20) – The number of categories of variable rates (not +# including the invariant site class if it is being used). Must be set +# to 1 if ratehetmodel is set to none. Note that runtimes and memory +# usage scale linearly with this setting. Default: 4 +numratecats = 4 + +# (none, estimate, fixed) Specifies whether a parameter representing +# the proportion of sites that are unable to change (i.e. have a +# substitution rate of zero) will be included. This is typically +# referred to as “invariant sites”, but would better be termed +# “invariable sites”. Default: estimate +invariantsites = estimate + +# ---------------------------------------------------------------------- + +[master] +nindivs = 4 +holdover = 1 +selectionintensity = .5 +holdoverpenalty = 0 + +# The maximum number of generations to run. Note that this supersedes +# the automated stopping criterion (see enforcetermconditions above), +# and should therefore be set to a very large value if automatic +# termination is desired. +stopgen = 1000000 + +# The maximum number of seconds for the run to continue. Note that +# this supersedes the automated stopping criterion (see +# enforcetermconditions above), and should therefore be set to a very +# large value if automatic termination is desired. +stoptime = {stoptime} + +startoptprec = 0.5 +minoptprec = 0.01 +numberofprecreductions = 20 +treerejectionthreshold = 50.0 +topoweight = 1.0 +modweight = 0.05 +brlenweight = 0.2 +randnniweight = 0.1 +randsprweight = 0.3 +limsprweight = 0.6 +intervallength = 100 +intervalstostore = 5 + +limsprrange = 6 +meanbrlenmuts = 5 +gammashapebrlen = 1000 +gammashapemodel = 1000 +uniqueswapbias = 0.1 +distanceswapbias = 1.0 +""" + +# ---------------------------------------------------------------------- + +def get_output_dir(working_dir: Path, prefix: str, new_output: bool): + if not new_output and (output_dirs := list(working_dir.glob(prefix + "*"))): + output_dir = max(output_dirs) + else: + output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S')) + output_dir.mkdir() + output_dir.chmod(0o777) + return output_dir + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subdir") + parser.add_argument("-n", "--num-runs", type=int, default=16) + parser.add_argument("-s", "--stoptime", default=60*60, type=int, help="termination condition: the maximum number of seconds for the run to continue") + parser.add_argument("-t", "--start-tree", default=None, type=Path) + parser.add_argument("-f", "--source-fasta", default=None, type=Path) + parser.add_argument("--new-output", action="store_true", default=False) + parser.add_argument("--node", dest="slurm_nodelist", default="i22", help="one node to run on, to run on multiple nodes use --exclude-nodes") + parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s") + exit_code = main(args) or 0 +except Exception as err: + logging.error(f"> {err}\n{traceback.format_exc()}") + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/garli-memvar b/proj/weekly-tree/garli-memvar new file mode 100755 index 
00000000..35dd4eef --- /dev/null +++ b/proj/weekly-tree/garli-memvar @@ -0,0 +1,452 @@ +#! /usr/bin/env python3 + +import sys, os, re, argparse, logging, time, datetime, concurrent.futures, subprocess, random, traceback +from pathlib import Path + +# ====================================================================== + +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +sRandomGen = random.SystemRandom() + +# ---------------------------------------------------------------------- + +def main(args: argparse.Namespace): + argv = '"' + '" "'.join(sys.argv) + '"' + working_dir = Path(args.subdir).resolve() + if args.start_tree: + tree = args.start_tree.resolve() + else: + tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve() + if args.source_fasta: + source_fasta = args.source_fasta.resolve() + else: + source_fasta = working_dir.joinpath("source.fas").resolve() + try: + best_tree_file = garli(working_dir=working_dir, tree=tree, source_fas=source_fasta, num_runs=args.num_runs, stoptime=args.stoptime, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes, new_output=args.new_output) + subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True) + return 0 + except Exception as err: + subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True) + raise + +# ---------------------------------------------------------------------- + +def garli(working_dir: Path, tree: Path, source_fas: Path, num_runs: int, stoptime: int, slurm_nodelist: str, slurm_exclude_nodes: str, new_output: bool): + + default_required_memory = 15800 + + # the number of attachment branches evaluated for each taxon to be added to the tree during the creation of an ML stepwise-addition starting tree (when garli is run without raxml step) + attachmentspertaxon = 1000000 + # termination condition: when no new significantly better scoring topology has been encountered in greater than this number of generations, garli stops + genthreshfortopoterm = 20000 + + output_dir = get_output_dir(working_dir, "garli.", new_output=new_output) + + garli_cmd = "/syn/bin/garli" + srun_cmd = [ + "srun", + "--cpus-per-task=2", + # "--nodes=1", + "--ntasks=1", + "--threads=1", + f"--nodelist={slurm_nodelist}", + ] + if slurm_exclude_nodes: + srun_cmd.append(f"--exclude={slurm_exclude_nodes}") + + # ---------------------------------------------------------------------- + + def make_conf(run_id, required_memory): + """Returns filename of the written conf file""" + # if not outgroup or not isinstance(outgroup, list) or not all(isinstance(e, int) and e > 0 for e in outgroup): + # raise ValueError("outgroup must be non-empty list of taxa indices in the fasta file starting with 1") + garli_args = { + "source": str(source_fas.resolve()), + "availablememory": required_memory or default_required_memory, + "streefname": str(tree.resolve()), + "output_prefix": str(output_dir.joinpath(run_id)), + "attachmentspertaxon": attachmentspertaxon, + "randseed": sRandomGen.randint(1, 0xFFFFFFF), + "genthreshfortopoterm": 
genthreshfortopoterm, + "searchreps": 1, + "stoptime": stoptime, + "outgroup": "1" # " ".join(str(e) for e in outgroup), + } + conf = GARLI_CONF.format(**garli_args) + conf_filename = output_dir.joinpath(run_id + ".garli.conf") + with conf_filename.open("w") as f: + f.write(conf) + return conf_filename + + # ---------------------------------------------------------------------- + + def find_memory_requirements(): + return default_required_memory + +# sGreat = re.compile(r"\s*great\s+>=\s+(\d+)\s+MB", re.I) +# conf_file = make_conf(run_id="find_memory_requirements", required_memory=None) +# start = time.time() +# proc = subprocess.Popen([garli_cmd, str(conf_file)], stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +# required_memory = None # default, in case the code below fails or times out +# timeout = 300 +# while required_memory is None: +# line = proc.stdout.readline().decode("utf-8") +# if mm := sGreat.match(line): +# required_memory = int(mm.group(1)) +# elif (time.time() - start) > timeout: +# required_memory = default_required_memory +# print(f"WARNING: Cannot obtain memory required by garli in {timeout} seconds: default for {working_dir.name}: {required_memory} will be used", file=sys.stderr) +# proc.kill() +# return required_memory + + # ---------------------------------------------------------------------- + + def run_garli(run_id, required_memory): + conf_file = make_conf(run_id=run_id, required_memory=required_memory) + # --mem-per-cpu is half of required_memory because we allocate whole 2 cpus per run to prevent from hyperthreading + cmd = (srun_cmd + [f"--mem-per-cpu={int(required_memory / 2) + 1}", f"--job-name='garli {working_dir.name} {run_id}'", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"] + + [garli_cmd, str(conf_file)]) + with output_dir.joinpath("commands.txt").open("a") as commands_txt: + print(" ".join(cmd), file=commands_txt) + # print(cmd) + subprocess.check_call(cmd) + + # ---------------------------------------------------------------------- + + def submit(): + required_memory = find_memory_requirements() + failures = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + future_to_runid = {executor.submit(run_garli, run_id, required_memory): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))} + for future in concurrent.futures.as_completed(future_to_runid): + try: + future.result() + except Exception as err: + logging.error(f"garli {working_dir.name} {future_to_runid[future]} FAILED: {err}") + failures += 1 + else: + logging.info(f"garli run {working_dir.name} {future_to_runid[future]} completed") + if failures: + logging.error(f"garli {working_dir.name} FAILED: {failures} runs failed") + else: + logging.info("garli completed") + + # ---------------------------------------------------------------------- + + def results_available(): + return len(list(output_dir.glob("*.best.phy"))) > (num_runs / 2) + + # ---------------------------------------------------------------------- + + sReFinalScore = re.compile(r"Final\t-(\d+\.\d+)") + + def find_best_result(): + # scores in results are positive numbers + results = [[log_file_name, float(sReFinalScore.search(log_file_name.open().read()).group(1))] for log_file_name in output_dir.glob("*.log00.log")] + if not results: + raise RuntimeError(f"no results found in {output_dir}/*.log00.log") + results.sort(key=lambda en: en[1]) + 
output_dir.parent.joinpath("garli-best.txt").open("w").write(str(results[0][0]).replace(".log00.log", ".best.phy")) + output_dir.parent.joinpath("garli-all.txt").open("w").write("\n".join(f"{score:13.6f} {str(log_name).replace('.log00.log', '.best.phy')}" for log_name, score in results)) + return [Path(str(results[0][0]).replace(".log00.log", ".best.phy")), results[0][1]] + + # ---------------------------------------------------------------------- + + if not results_available(): + start = datetime.datetime.now() + submit() + logging.info(f"[{working_dir}]: garli elapsed: {datetime.datetime.now() - start}") + best_tree_file, best_score = find_best_result() + logging.info(f"[{working_dir}]: garli best: {best_tree_file} score: {best_score}") + return best_tree_file + +# ---------------------------------------------------------------------- + +GARLI_CONF = """\ +[general] +datafname = {source} +streefname = {streefname} +# Prefix of all output filenames, such as log, treelog, etc. Change +# this for each run that you do or the program will overwrite previous +# results. +ofprefix = {output_prefix} + +constraintfile = none + +# (1 to infinity) The number of attachment branches evaluated for each +# taxon to be added to the tree during the creation of an ML +# stepwise-addition starting tree. Briefly, stepwise addition is an +# algorithm used to make a tree, and involves adding taxa in a random +# order to a growing tree. For each taxon to be added, a number of +# randomly chosen attachment branches are tried and scored, and then +# the best scoring one is chosen as the location of that taxon. The +# attachmentspertaxon setting controls how many attachment points are +# evaluated for each taxon to be added. A value of one is equivalent +# to a completely random tree (only one randomly chosen location is +# evaluated). A value of greater than 2 times the number of taxa in +# the dataset means that all attachment points will be evaluated for +# each taxon, and will result in very good starting trees (but may +# take a while on large datasets). Even fairly small values (< 10) can +# result in starting trees that are much, much better than random, but +# still fairly different from one another. Default: 50. +attachmentspertaxon = {attachmentspertaxon} + +# -1 - random seed chosen automatically +randseed = {randseed} + +# in Mb +availablememory = {availablememory} + +# The frequency with which the best score is written to the log file, default: 10 +logevery = 1000 + +# Whether to write three files to disk containing all information +# about the current state of the population every saveevery +# generations, with each successive checkpoint overwriting the +# previous one. These files can be used to restart a run at the last +# written checkpoint by setting the restart configuration entry. Default: 0 +writecheckpoints = 0 + +# Whether to restart at a previously saved checkpoint. To use this +# option the writecheckpoints option must have been used during a +# previous run. The program will look for checkpoint files that are +# named based on the ofprefix of the previous run. If you intend to +# restart a run, NOTHING should be changed in the config file except +# setting restart to 1. A run that is restarted from checkpoint will +# give exactly the same results it would have if the run had gone to +# completion. 
Default: 0 +restart = 0 + +# If writecheckpoints or outputcurrentbesttopology are specified, this +# is the frequency (in generations) at which checkpoints or the +# current best tree are written to file. default: 100 +saveevery = 1000 + +# Specifies whether some initial rough optimization is performed on +# the starting branch lengths and alpha parameter. This is always +# recommended. Default: 1. +refinestart = 1 + +# If true, the current best tree of the current search replicate is +# written to .best.current.tre every saveevery +# generations. In versions before 0.96 the current best topology was +# always written to file, but that is no longer the case. Seeing the +# current best tree has no real use apart from satisfying your +# curiosity about how a run is going. Default: 0. +outputcurrentbesttopology = 0 + +# If true, each new topology encountered with a better score than the +# previous best is written to file. In some cases this can result in +# really big files (hundreds of MB) though, especially for random +# starting topologies on large datasets. Note that this file is +# interesting to get an idea of how the topology changed as the +# searches progressed, but the collection of trees should NOT be +# interpreted in any meaningful way. This option is not available +# while bootstrapping. Default: 0. +outputeachbettertopology = 0 + +# Specifies whether the automatic termination conditions will be +# used. The conditions specified by both of the following two +# parameters must be met. See the following two parameters for their +# definitions. If this is false, the run will continue until it +# reaches the time (stoptime) or generation (stopgen) limit. It is +# highly recommended that this option be used! Default: 1. +enforcetermconditions = 1 + +# This specifies the first part of the termination condition. When no +# new significantly better scoring topology (see significanttopochange +# below) has been encountered in greater than this number of +# generations, this condition is met. Increasing this parameter may +# improve the lnL scores obtained (especially on large datasets), but +# will also increase runtimes. Default: 20000. +genthreshfortopoterm = {genthreshfortopoterm} + +# The second part of the termination condition. When the total +# improvement in score over the last intervallength x intervalstostore +# generations (default is 500 generations, see below) is less than +# this value, this condition is met. This does not usually need to be +# changed. Default: 0.05 +scorethreshforterm = 0.05 + +# The lnL increase required for a new topology to be considered +# significant as far as the termination condition is concerned. It +# probably doesn’t need to be played with, but you might try +# increasing it slightly if your runs reach a stable score and then +# take a very long time to terminate due to very minor changes in +# topology. Default: 0.01 +significanttopochange = 0.01 + +# Whether a phylip formatted tree files will be output in addition to +# the default nexus files for the best tree across all replicates +# (.best.phy), the best tree for each replicate +# (.best.all.phy) or in the case of bootstrapping, the best +# tree for each bootstrap replicate (). +# We use .phy (it's newick tree format), use 1 here! +outputphyliptree = 1 + +# Whether to output three files of little general interest: the +# “fate”, “problog” and “swaplog” files. The fate file shows the +# parentage, mutation types and scores of every individual in the +# population during the entire search. 
The problog shows how the +# proportions of the different mutation types changed over the course +# of the run. The swaplog shows the number of unique swaps and the +# number of total swaps on the current best tree over the course of +# the run. Default: 0 +outputmostlyuselessfiles = 0 + +# This option allow for orienting the tree topologies in a consistent +# way when they are written to file. Note that this has NO effect +# whatsoever on the actual inference and the specified outgroup is NOT +# constrained to be present in the inferred trees. If multiple +# outgroup taxa are specified and they do not form a monophyletic +# group, this setting will be ignored. If you specify a single +# outgroup taxon it will always be present, and the tree will always +# be consistently oriented. To specify an outgroup consisting of taxa +# 1, 3 and 5 the format is this: outgroup = 1 3 5 +outgroup = {outgroup} + +resampleproportion = 1.0 +inferinternalstateprobs = 0 +outputsitelikelihoods = 0 +optimizeinputonly = 0 +collapsebranches = 1 + +# The number of independent search replicates to perform during a +# program execution. You should always either do multiple search +# replicates or multiple program executions with any dataset to get a +# feel for whether you are getting consistent results, which suggests +# that the program is doing a decent job of searching. Note that if +# this is > 1 and you are performing a bootstrap analysis, this is the +# number of search replicates to be done per bootstrap replicate. That +# can increase the chance of finding the best tree per bootstrap +# replicate, but will also increase bootstrap runtimes +# enormously. Default: 2 +searchreps = {searchreps} + +bootstrapreps = 0 + +# ------------------- FOR NUCLEOTIDES -------------- +datatype=nucleotide + +# The number of relative substitution rate parameters (note that the +# number of free parameters is this value minus one). Equivalent to +# the “nst” setting in PAUP* and MrBayes. 1rate assumes that +# substitutions between all pairs of nucleotides occur at the same +# rate, 2rate allows different rates for transitions and +# transversions, and 6rate allows a different rate between each +# nucleotide pair. These rates are estimated unless the fixed option +# is chosen. New in version 0.96, parameters for any submodel of the +# GTR model may now be estimated. The format for specifying this is +# very similar to that used in the “rclass’ setting of PAUP*. Within +# parentheses, six letters are specified, with spaces between +# them. The six letters represent the rates of substitution between +# the six pairs of nucleotides, with the order being A-C, A-G, A-T, +# C-G, C-T and G-T. Letters within the parentheses that are the same +# mean that a single parameter is shared by multiple nucleotide +# pairs. For example, ratematrix = (a b a a b a) would specify the HKY +# 2-rate model (equivalent to ratematrix = 2rate). This entry, +# ratematrix = (a b c c b a) would specify 3 estimated rates of +# subtitution, with one rate shared by A-C and G-T substitutions, +# another rate shared by A-G and C-T substitutions, and the final rate +# shared by A-T and C-G substitutions. Default: 6rate +ratematrix = 6rate + +# (equal, empirical, estimate, fixed) Specifies how the equilibrium +# state frequencies (A, C, G and T) are treated. The empirical setting +# fixes the frequencies at their observed proportions, and the other +# options should be self-explanatory. 
Default: estimate +statefrequencies = estimate + +# (none, gamma, gammafixed) – The model of rate heterogeneity +# assumed. “gammafixed” requires that the alpha shape parameter is +# provided, and a setting of “gamma” estimates it. Default: gamma +ratehetmodel = gamma + +# (1 to 20) – The number of categories of variable rates (not +# including the invariant site class if it is being used). Must be set +# to 1 if ratehetmodel is set to none. Note that runtimes and memory +# usage scale linearly with this setting. Default: 4 +numratecats = 4 + +# (none, estimate, fixed) Specifies whether a parameter representing +# the proportion of sites that are unable to change (i.e. have a +# substitution rate of zero) will be included. This is typically +# referred to as “invariant sites”, but would better be termed +# “invariable sites”. Default: estimate +invariantsites = estimate + +# ---------------------------------------------------------------------- + +[master] +nindivs = 4 +holdover = 1 +selectionintensity = .5 +holdoverpenalty = 0 + +# The maximum number of generations to run. Note that this supersedes +# the automated stopping criterion (see enforcetermconditions above), +# and should therefore be set to a very large value if automatic +# termination is desired. +stopgen = 1000000 + +# The maximum number of seconds for the run to continue. Note that +# this supersedes the automated stopping criterion (see +# enforcetermconditions above), and should therefore be set to a very +# large value if automatic termination is desired. +stoptime = {stoptime} + +startoptprec = 0.5 +minoptprec = 0.01 +numberofprecreductions = 20 +treerejectionthreshold = 50.0 +topoweight = 1.0 +modweight = 0.05 +brlenweight = 0.2 +randnniweight = 0.1 +randsprweight = 0.3 +limsprweight = 0.6 +intervallength = 100 +intervalstostore = 5 + +limsprrange = 6 +meanbrlenmuts = 5 +gammashapebrlen = 1000 +gammashapemodel = 1000 +uniqueswapbias = 0.1 +distanceswapbias = 1.0 +""" + +# ---------------------------------------------------------------------- + +def get_output_dir(working_dir: Path, prefix: str, new_output: bool): + if not new_output and (output_dirs := list(working_dir.glob(prefix + "*"))): + output_dir = max(output_dirs) + else: + output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S')) + output_dir.mkdir() + output_dir.chmod(0o777) + return output_dir + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subdir") + parser.add_argument("-n", "--num-runs", type=int, default=16) + parser.add_argument("-s", "--stoptime", default=60*60, type=int, help="termination condition: the maximum number of seconds for the run to continue") + parser.add_argument("-t", "--start-tree", default=None, type=Path) + parser.add_argument("-f", "--source-fasta", default=None, type=Path) + parser.add_argument("--new-output", action="store_true", default=False) + parser.add_argument("--node", dest="slurm_nodelist", default="i22", help="one node to run on, to run on multiple nodes use --exclude-nodes") + parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s") + exit_code = main(args) or 0 +except Exception as err: + logging.error(f"> {err}\n{traceback.format_exc()}") + exit_code = 1 +exit(exit_code) + +# 
====================================================================== diff --git a/proj/weekly-tree/init-h3 b/proj/weekly-tree/init-h3 new file mode 100755 index 00000000..36d521ea --- /dev/null +++ b/proj/weekly-tree/init-h3 @@ -0,0 +1,93 @@ +#! /usr/bin/env python3 +import sys, os, datetime, subprocess +from pathlib import Path + +WEEKLY_TREE_ROOT_DIR = Path("/syn/eu/ac/results/trees") +TODAY = datetime.date.today().strftime("%Y-%m%d") +TODAY_DIR = WEEKLY_TREE_ROOT_DIR.joinpath(TODAY) +SUBTYPES = ["h3"] + +CMD_DIR = Path(sys.argv[0]).parent +EXPORT_CMD = CMD_DIR.joinpath("export") +UPDATE_TREE_CMD = CMD_DIR.joinpath("update-tree") + +# ====================================================================== + +def main(): + previous_dir = find_previous_dir() + print(f">>> previous dir: {previous_dir}", file=sys.stderr) + init_today_dir(previous_dir) + +# ---------------------------------------------------------------------- + +def init_today_dir(previous_dir: Path): + TODAY_DIR.mkdir(exist_ok=True) + for subtype in SUBTYPES: + init_subtype(subtype, previous_dir=previous_dir) + +# ---------------------------------------------------------------------- + +def init_subtype(subtype: str, previous_dir: Path): + subtype_dir = TODAY_DIR.joinpath(subtype) + subtype_dir.mkdir(exist_ok=True) + if not (previous_newick := subtype_dir.joinpath("previous.newick")).exists(): + previous_raxml_best = previous_dir.joinpath(subtype, "raxml-best.txt") + if not previous_raxml_best.exists(): + print(f">> no previous raxml best run report: {previous_raxml_best}", file=sys.stderr) + return + previous_newick_link_target = Path(previous_raxml_best.open().read().strip()) + link = relative_path(previous_newick_link_target, subtype_dir) + print(f">>> {previous_newick} -> {link}", file=sys.stderr) + previous_newick.symlink_to(link) + + if not (source_fas := subtype_dir.joinpath("source.fas")).exists(): + export_source_fas(subtype=subtype, source_fas=source_fas, previous_dir=previous_dir) + + if not (source_newick := subtype_dir.joinpath("source.newick")).exists(): + update_source_newick(subtype=subtype, source_newick=source_newick, previous_dir=previous_dir) + +# ---------------------------------------------------------------------- + +def update_source_newick(subtype: str, source_newick: Path, previous_dir: Path): + print(f">>> updating {source_newick}", file=sys.stderr) + log = subprocess.check_output([UPDATE_TREE_CMD, subtype], cwd=source_newick.parents[1]).decode("utf-8") + source_newick.with_name("update-tree.log").open("w").write(log) + print(log) + +# ---------------------------------------------------------------------- + +def export_source_fas(subtype: str, source_fas: Path, previous_dir: Path): + print(f">>> exporting {source_fas}", file=sys.stderr) + subprocess.check_call([EXPORT_CMD, subtype], cwd=source_fas.parents[1]) + subprocess.check_call(["fasta-size", source_fas, previous_dir.joinpath(subtype, source_fas.name)]) + +# ---------------------------------------------------------------------- + +def relative_path(target: Path, source_dir: Path): + # print(target.parts, source_dir.parts) + for initial_parts in range(min(len(target.parts), len(source_dir.parts))): + # print(initial_parts, target.parts[initial_parts], source_dir.parts[initial_parts], target.parts[initial_parts] == source_dir.parts[initial_parts]) + if target.parts[initial_parts] != source_dir.parts[initial_parts]: + break + # print(initial_parts, target.parts[initial_parts:], source_dir.parts[initial_parts:]) + return Path(*([".."] * 
(len(source_dir.parts) - initial_parts)), *target.parts[initial_parts:]) + +# ---------------------------------------------------------------------- + +def find_previous_dir(): + dirs = list(WEEKLY_TREE_ROOT_DIR.glob("2*")) + found = None + use = len(dirs) - 1 + while not found and use >= 0: + use_date = dirs[use].stem[:9] + if use_date != TODAY: + found = dirs[use] + use -= 1 + if not found: + dirs_s = "\n ".join(str(dd) for dd in dirs) + raise RuntimeError(f"previous dir not found in\n {dirs_s}") + return found + +# ====================================================================== + +main() diff --git a/proj/weekly-tree/submit b/proj/weekly-tree/submit index 4cb9be3e..3c029e70 100755 --- a/proj/weekly-tree/submit +++ b/proj/weekly-tree/submit @@ -1,3 +1,5 @@ + + #! /usr/bin/env bash set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace diff --git a/proj/weekly-tree/submit-memvar b/proj/weekly-tree/submit-memvar new file mode 100755 index 00000000..1a2c593f --- /dev/null +++ b/proj/weekly-tree/submit-memvar @@ -0,0 +1,18 @@ +#! /usr/bin/env bash +set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace + +submit_tree() +{ + NODE="$1" + SUBTYPE="$2" + ( + ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/garli-memvar --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE} + ) >${SUBTYPE}/submit.log 2>&1 & +} + +# submit_tree i21 h1 +# submit_tree i22 h3 +submit_tree i22 bvic diff --git a/proj/weekly-tree/submit-new b/proj/weekly-tree/submit-new new file mode 100755 index 00000000..69dd730f --- /dev/null +++ b/proj/weekly-tree/submit-new @@ -0,0 +1,18 @@ +#! /usr/bin/env bash +set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace + +submit_tree() +{ + NODE="$1" + SUBTYPE="$2" + ( + ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/garli-h3 --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE} + ) >${SUBTYPE}/submit.log 2>&1 & +} + +submit_tree i20 h1 #i21 +submit_tree i22 h3 +submit_tree i19 bvic From 9c1edcd2578c98c6aa595fcf44a2590653428aae Mon Sep 17 00:00:00 2001 From: SamT123 Date: Fri, 6 Jun 2025 14:02:09 +0100 Subject: [PATCH 03/16] Update make-cmaple Use CMAPLE for initial tree, not raxml --- R/collapse_short_branches.R | 40 ++++++++++ R/rotate.R | 43 +++++++++++ R/set_min_branch_lengths.R | 13 ++++ proj/weekly-tree/make-cmaple | 143 +++++++++++++++++++++++++++++++++++ proj/weekly-tree/pdf | 4 +- proj/weekly-tree/raxml-asr | 18 ++++- proj/weekly-tree/submit | 7 +- 7 files changed, 261 insertions(+), 7 deletions(-) create mode 100644 R/collapse_short_branches.R create mode 100644 R/rotate.R create mode 100755 R/set_min_branch_lengths.R create mode 100755 proj/weekly-tree/make-cmaple diff --git a/R/collapse_short_branches.R b/R/collapse_short_branches.R new file mode 100644 index 00000000..dd01905b --- /dev/null +++ b/R/collapse_short_branches.R @@ -0,0 +1,40 @@ +#! /usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +inphy.txt = args[[1]] +tol = as.numeric(args[[2]]) + +# file names ------------------------------------------------------------- +inphy = readLines(inphy.txt, n = 1) + +outphy = paste0( + paste0( + fs::path_ext_remove(inphy), + "-col." + ), + fs::path_ext(inphy) +) + +outphy.txt = paste0( + paste0( + fs::path_ext_remove(inphy.txt), + "-col." 
+ ), + fs::path_ext(inphy.txt) +) + +# collapse --------------------------------------------------------------- +phy = ape::read.tree(file = inphy) + + +# message(length(phy$edge.length)) +# message(min(phy$edge.length)) +# message(tol) +# message(sum(phy$edge.length < tol)) + +phy_col = ape::di2multi(phy, tol = tol) +message(ape::Nnode(phy), " -> ", ape::Nnode(phy_col)) + +ape::write.tree(phy_col, file = outphy) +writeLines(outphy, outphy.txt) diff --git a/R/rotate.R b/R/rotate.R new file mode 100644 index 00000000..ee4183ab --- /dev/null +++ b/R/rotate.R @@ -0,0 +1,43 @@ +#! /usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +inphy.txt = args[[1]] +infasta = args[[2]] + +# file names +inphy = readLines(inphy.txt, n = 1) + +outphy = paste0( + paste0( + fs::path_ext_remove(inphy), + "-rot." + ), + fs::path_ext(inphy) +) + +outphy.txt = paste0( + paste0( + fs::path_ext_remove(inphy.txt), + "-rot." + ), + fs::path_ext(inphy.txt) +) + +# rotate +fa = ape::read.dna( + file = infasta, + format = "fasta", + as.character = T, + as.matrix = F +) + +outgroup = names(fa)[[1]] + +t = ape::read.tree(file = inphy) + +t = ape::root(t, outgroup = outgroup) +t = ape::ladderize(t, right = F) + +ape::write.tree(t, file = outphy) +writeLines(outphy, outphy.txt) diff --git a/R/set_min_branch_lengths.R b/R/set_min_branch_lengths.R new file mode 100755 index 00000000..2c2d60d7 --- /dev/null +++ b/R/set_min_branch_lengths.R @@ -0,0 +1,13 @@ +#! /usr/bin/env Rscript + +args <- commandArgs(trailingOnly = TRUE) + +infile = args[[1]] +outfile = args[[2]] +min_bl = as.numeric(args[[3]]) + +t = ape::read.tree(file = infile) + +t$edge.length[t$edge.length < min_bl] = min_bl + +ape::write.tree(phy = t, file = outfile) diff --git a/proj/weekly-tree/make-cmaple b/proj/weekly-tree/make-cmaple new file mode 100755 index 00000000..ab74af97 --- /dev/null +++ b/proj/weekly-tree/make-cmaple @@ -0,0 +1,143 @@ +#! 
/usr/bin/env python3 + +import sys, os, re, argparse, logging, datetime, concurrent.futures, subprocess, random, traceback +from pathlib import Path + +# ====================================================================== + +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +sRandomGen = random.SystemRandom() + +sMailTo = "sat65@cam.ac.uk" + +# ---------------------------------------------------------------------- + +def main(args: argparse.Namespace): + argv = '"' + '" "'.join(sys.argv) + '"' + working_dir = Path(args.subdir).resolve() + source_fas = working_dir.joinpath("source.fas") + # source_tree = working_dir.joinpath("source.newick") no source tree, we make from scratch each time + outgroup = source_fas.open().readline().strip()[1:] + try: + best_tree_file = cmaple(working_dir=working_dir, source_fas=source_fas, outgroup=outgroup, num_runs=args.num_runs, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes) + subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", sMailTo], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True) + logging.info(f"mail to {sMailTo} about completion sent") + return 0 + except Exception as err: + logging.error(f"", exc_info=True) + subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", sMailTo], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True) + logging.info(f"mail to {sMailTo} about FAILURE sent") + raise + +# ---------------------------------------------------------------------- + +def cmaple(working_dir: Path, source_fas: Path, outgroup: str, slurm_nodelist: str, slurm_exclude_nodes: str, num_runs: int): + + output_dir = get_output_dir(working_dir, "cmaple.") + + cmaple_cmd = [ + "/syn/bin/cmaple", + "-aln", str(source_fas), + "-st", "DNA", + "-m", "GTR", + "--search", "EXHAUSTIVE", + "--min-blength", "0.0001", + "-nt", "1", + "--overwrite" + ] + + # ---------------------------------------------------------------------- + + def prefix(run_id): + return f"{output_dir.joinpath(run_id)}" + + def run_cmaple(run_id): + cmd = (srun_cmd() + + [f"--job-name=vcm_cmaple {working_dir.name} {run_id}", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"] + + cmaple_cmd + + ["--prefix", prefix(run_id), "--seed", str(sRandomGen.randint(1, 0xFFFFFFF))]) + with output_dir.joinpath("commands.txt").open("a") as commands_txt: + print(" ".join(cmd), file=commands_txt) + # print(cmd) + subprocess.check_call(cmd) + + def srun_cmd(): + cmd = [ + "srun", + "--cpus-per-task=2", + "--ntasks=1", + "--threads=1", + f"--nodelist={slurm_nodelist}", + ] + if slurm_exclude_nodes: + cmd.append(f"--exclude={slurm_exclude_nodes}") + return cmd + + # ---------------------------------------------------------------------- + + def submit(): + failures = 0 + logging.info(f"submitting {num_runs} jobs") + with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor: + future_to_runid = {executor.submit(run_cmaple, run_id): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))} + for future in concurrent.futures.as_completed(future_to_runid): + try: + future.result() + except Exception as err: + logging.error(f"cmaple {working_dir} {future_to_runid[future]} 
FAILED: {err}") + failures += 1 + else: + logging.info(f"cmaple run {working_dir} {future_to_runid[future]} completed") + if failures: + logging.error(f"cmaple {working_dir} FAILED: {failures} runs failed") + else: + logging.info("cmaple completed") + + # ---------------------------------------------------------------------- + + def results_available(): + return len(list(output_dir.glob("*000.treefile"))) > 0 + + tree_file = output_dir.joinpath("000.treefile") + + if not results_available(): + start = datetime.datetime.now() + submit() + logging.info(f"cmaple elapsed: {datetime.datetime.now() - start}") + + output_dir.parent.joinpath("cmaple_tree.txt").open("w").write(str(tree_file)) + + return tree_file + +# ---------------------------------------------------------------------- + +def get_output_dir(working_dir, prefix): + output_dirs = list(working_dir.glob(prefix + "*")) + if output_dirs: + output_dir = max(output_dirs) + else: + output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S')) + output_dir.mkdir() + output_dir.chmod(0o777) + return output_dir + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subdir") + # parser.add_argument("outgroup") + parser.add_argument("-n", "--num-runs", dest="num_runs", type=int, default=1) + parser.add_argument("--node", dest="slurm_nodelist", default="i22") + parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s") + exit_code = main(args) or 0 +except Exception as err: + logging.error(f"> {err}\n{traceback.format_exc()}") + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/pdf b/proj/weekly-tree/pdf index eacaa2f8..b54aee9c 100755 --- a/proj/weekly-tree/pdf +++ b/proj/weekly-tree/pdf @@ -18,7 +18,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "b" ]]; then HIDB_SUBTYPE=b; fi # ---------------------------------------------------------------------------------------------------- if [[ ! -f ${SUBTYPE}/${SUBTYPE}.tjz ]]; then - ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/garli-best.txt) ${SUBTYPE}/${SUBTYPE}.tjz + ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-col.txt) ${SUBTYPE}/${SUBTYPE}.tjz fi if [[ ! -f ${SUBTYPE}/${SUBTYPE}.pdf ]]; then tal -s ${TAL} ${SUBTYPE}/${SUBTYPE}.tjz ${SUBTYPE}/${SUBTYPE}.txt ${SUBTYPE}/${SUBTYPE}.pdf @@ -30,7 +30,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "h" ]]; then ASR_INFIX=".asr" if [[ ! 
-f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ]]; then ASR_STATES=$(cat ${SUBTYPE}/raxml-asr-best.txt | sed 's/ancestralTree/ancestralStates/') - ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/garli-best.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz + ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-col.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz fi if [[ ! -f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf ]]; then tal -s ${TAL} -D aa-transitions-method=imported ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.txt ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf diff --git a/proj/weekly-tree/raxml-asr b/proj/weekly-tree/raxml-asr index 8baa0595..a736e54b 100755 --- a/proj/weekly-tree/raxml-asr +++ b/proj/weekly-tree/raxml-asr @@ -11,7 +11,8 @@ def main(args: argparse.Namespace): argv = '"' + '" "'.join(sys.argv) + '"' working_dir = Path(args.subdir).resolve() source_fas = working_dir.joinpath("source.fas") - source_tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve() + source_tree = Path(working_dir.joinpath("cmaple_tree-rot.txt").open().read().strip()).resolve() + if os.fork(): # parent exit(0) @@ -30,13 +31,25 @@ def raxml_ng_asr(working_dir: Path, source_fas: Path, source_tree: Path, slurm_n output_dir = get_output_dir(working_dir, "raxml-ng-asr.") + source_tree_with_min_branch_lengths = source_tree.parent.joinpath("000_minbl.treefile") + bl_script_path = os.environ["AE_ROOT"] + "/R/set_min_branch_lengths.R" + + set_min_bl_cmd = [ + bl_script_path, + str(source_tree.resolve()), + str(source_tree_with_min_branch_lengths.resolve()), + "0.0001" + ] + + # ---------------------------------------------------------------------- + raxml_cmd = [ "/syn/bin/raxml-ng", "--ancestral", "--model", "GTR+G+I", # raxml: -m GTRGAMMAI -c 4 "--msa", str(source_fas), "--msa-format", "FASTA", - "--tree", str(source_tree), + "--tree", str(source_tree_with_min_branch_lengths), "--log", "PROGRESS", # VERBOSE, DEBUG "--threads", "1", ] @@ -53,6 +66,7 @@ def raxml_ng_asr(working_dir: Path, source_fas: Path, source_tree: Path, slurm_n with output_dir.joinpath("commands.txt").open("a") as commands_txt: print(" ".join(cmd), file=commands_txt) # print(cmd) + subprocess.call(set_min_bl_cmd) subprocess.check_call(cmd) def srun_cmd(): diff --git a/proj/weekly-tree/submit b/proj/weekly-tree/submit index 3c029e70..9f1ab027 100755 --- a/proj/weekly-tree/submit +++ b/proj/weekly-tree/submit @@ -8,13 +8,14 @@ submit_tree() NODE="$1" SUBTYPE="$2" ( - ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/proj/weekly-tree/make-cmaple --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/R/rotate.R ${SUBTYPE}/cmaple_tree.txt ${SUBTYPE}/source.fas && ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} && - ${AE_ROOT}/proj/weekly-tree/garli --node ${NODE} ${SUBTYPE} && + ${AE_ROOT}/R/collapse_short_branches.R ${SUBTYPE}/cmaple_tree-rot.txt 0.00005 && ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE} ) >${SUBTYPE}/submit.log 2>&1 & } submit_tree i20 h1 submit_tree i22 h3 -submit_tree i19 bvic +submit_tree i21 bvic From e1d0b9934235bf5bb44dd46097bf8d67c1a52c05 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Tue, 8 Jul 2025 
22:01:01 -0400 Subject: [PATCH 04/16] Increase recursion depth in newick.cc --- cc/tree/newick.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cc/tree/newick.cc b/cc/tree/newick.cc index 85ff59de..dd4f1f3c 100644 --- a/cc/tree/newick.cc +++ b/cc/tree/newick.cc @@ -174,7 +174,7 @@ namespace ae::tree::newick { static constexpr auto whitespace = dsl::ascii::space / dsl::ascii::newline; static constexpr auto rule = dsl::p + dsl::semicolon + dsl::eof; - static constexpr auto max_recursion_depth = 1000; + static constexpr auto max_recursion_depth = 10000; static constexpr auto value = lexy::forward; }; From 70ee6fe45468728882689afc152510a582b63e25 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Wed, 16 Jul 2025 10:31:58 +0100 Subject: [PATCH 05/16] Change proj/weekly-tree/init to refer to cmaple tree, not raxml tree --- proj/weekly-tree/init | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/proj/weekly-tree/init b/proj/weekly-tree/init index 8939b6e1..a8dc0f14 100755 --- a/proj/weekly-tree/init +++ b/proj/weekly-tree/init @@ -31,11 +31,11 @@ def init_subtype(subtype: str, previous_dir: Path): subtype_dir = TODAY_DIR.joinpath(subtype) subtype_dir.mkdir(exist_ok=True) if not (previous_newick := subtype_dir.joinpath("previous.newick")).exists(): - previous_raxml_best = previous_dir.joinpath(subtype, "raxml-best.txt") - if not previous_raxml_best.exists(): - print(f">> no previous raxml best run report: {previous_raxml_best}", file=sys.stderr) + previous_tree = previous_dir.joinpath(subtype, "cmaple_tree.txt") + if not previous_tree.exists(): + print(f">> no previous tree run report: {previous_tree}", file=sys.stderr) return - previous_newick_link_target = Path(previous_raxml_best.open().read().strip()) + previous_newick_link_target = Path(previous_tree.open().read().strip()) link = relative_path(previous_newick_link_target, subtype_dir) print(f">>> {previous_newick} -> {link}", file=sys.stderr) previous_newick.symlink_to(link) From 847b67961c1da07c49fe0b6c02faf1d4c2c8df65 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Wed, 16 Jul 2025 22:43:51 +0100 Subject: [PATCH 06/16] tidy R scripts --- R/collapse_short_branches.R | 90 +++++++++++++++++++++---------------- R/rotate.R | 85 ++++++++++++++++++++++------------- R/set_min_branch_lengths.R | 27 ++++++++--- 3 files changed, 124 insertions(+), 78 deletions(-) diff --git a/R/collapse_short_branches.R b/R/collapse_short_branches.R index dd01905b..833217da 100644 --- a/R/collapse_short_branches.R +++ b/R/collapse_short_branches.R @@ -1,40 +1,52 @@ #! /usr/bin/env Rscript - -args <- commandArgs(trailingOnly = TRUE) - -inphy.txt = args[[1]] -tol = as.numeric(args[[2]]) - -# file names ------------------------------------------------------------- -inphy = readLines(inphy.txt, n = 1) - -outphy = paste0( - paste0( - fs::path_ext_remove(inphy), - "-col." - ), - fs::path_ext(inphy) -) - -outphy.txt = paste0( - paste0( - fs::path_ext_remove(inphy.txt), - "-col." 
- ), - fs::path_ext(inphy.txt) -) - -# collapse --------------------------------------------------------------- -phy = ape::read.tree(file = inphy) - - -# message(length(phy$edge.length)) -# message(min(phy$edge.length)) -# message(tol) -# message(sum(phy$edge.length < tol)) - -phy_col = ape::di2multi(phy, tol = tol) -message(ape::Nnode(phy), " -> ", ape::Nnode(phy_col)) - -ape::write.tree(phy_col, file = outphy) -writeLines(outphy, outphy.txt) +main = function() { + message("COLLAPSE SHORT BRANCH LENGTHS") + + args <- commandArgs(trailingOnly = TRUE) + + inphy.txt = args[[1]] + tol = as.numeric(args[[2]]) + + # file names ------------------------------------------------------------- + inphy = readLines(inphy.txt, n = 1) + + outphy = paste0( + paste0( + fs::path_ext_remove(inphy), + "-col." + ), + fs::path_ext(inphy) + ) + + outphy.txt = paste0( + paste0( + fs::path_ext_remove(inphy.txt), + "-col." + ), + fs::path_ext(inphy.txt) + ) + + if (fs::file_exists(outphy) & fs::file_exists(outphy.txt)) { + message( + outphy, + " and ", + outphy.txt, + " already exist! Continuing without re-rotating tree." + ) + return(0) + } + + message("Reading tree") + phy = ape::read.tree(file = inphy) + + message("Collapsing branches shorter than ", tol) + phy_col = ape::di2multi(phy, tol = tol) + message(ape::Nnode(phy), " -> ", ape::Nnode(phy_col)) + + message("Writing tree") + ape::write.tree(phy_col, file = outphy) + message("Writing ", outphy.txt) + writeLines(outphy, outphy.txt) +} + +main() diff --git a/R/rotate.R b/R/rotate.R index ee4183ab..2d838e39 100644 --- a/R/rotate.R +++ b/R/rotate.R @@ -1,43 +1,64 @@ #! /usr/bin/env Rscript +main = function() { + message("ROTATE TREE") -args <- commandArgs(trailingOnly = TRUE) + args <- commandArgs(trailingOnly = TRUE) -inphy.txt = args[[1]] -infasta = args[[2]] + inphy.txt = args[[1]] + infasta = args[[2]] -# file names -inphy = readLines(inphy.txt, n = 1) + # file names + inphy = readLines(inphy.txt, n = 1) -outphy = paste0( - paste0( - fs::path_ext_remove(inphy), - "-rot." - ), - fs::path_ext(inphy) -) + outphy = paste0( + paste0( + fs::path_ext_remove(inphy), + "-rot." + ), + fs::path_ext(inphy) + ) -outphy.txt = paste0( - paste0( - fs::path_ext_remove(inphy.txt), - "-rot." - ), - fs::path_ext(inphy.txt) -) + outphy.txt = paste0( + paste0( + fs::path_ext_remove(inphy.txt), + "-rot." + ), + fs::path_ext(inphy.txt) + ) -# rotate -fa = ape::read.dna( - file = infasta, - format = "fasta", - as.character = T, - as.matrix = F -) + if (fs::file_exists(outphy) & fs::file_exists(outphy.txt)) { + message( + outphy, + " and ", + outphy.txt, + " already exist! Continuing without re-rotating tree." 
+ ) + return(0) + } -outgroup = names(fa)[[1]] + message("Reading fasta") + fa = ape::read.dna( + file = infasta, + format = "fasta", + as.character = T, + as.matrix = F + ) -t = ape::read.tree(file = inphy) + outgroup = names(fa)[[1]] -t = ape::root(t, outgroup = outgroup) -t = ape::ladderize(t, right = F) + message("Reading ", inphy) + t = ape::read.tree(file = inphy) -ape::write.tree(t, file = outphy) -writeLines(outphy, outphy.txt) + message("Rooting tree at ", outgroup) + t = ape::root(t, outgroup = outgroup) + message("Ladderizing tree") + t = ape::ladderize(t, right = F) + + message("Writing tree") + ape::write.tree(t, file = outphy) + message("Writing ", outphy.txt) + writeLines(outphy, outphy.txt) + return(0) +} + +main() diff --git a/R/set_min_branch_lengths.R b/R/set_min_branch_lengths.R index 2c2d60d7..04493502 100755 --- a/R/set_min_branch_lengths.R +++ b/R/set_min_branch_lengths.R @@ -1,13 +1,26 @@ #! /usr/bin/env Rscript -args <- commandArgs(trailingOnly = TRUE) +main = function() { + message("SET MINIMUM BRANCH LENGTHS") -infile = args[[1]] -outfile = args[[2]] -min_bl = as.numeric(args[[3]]) + args <- commandArgs(trailingOnly = TRUE) -t = ape::read.tree(file = infile) + infile = args[[1]] + outfile = args[[2]] + min_bl = as.numeric(args[[3]]) -t$edge.length[t$edge.length < min_bl] = min_bl + if (fs::file_exists(outfile)) { + message(outfile, " already exists. Not remaking.") + } -ape::write.tree(phy = t, file = outfile) + message("Reading tree") + t = ape::read.tree(file = infile) + + message("Setting minimum branch lengths to ", min_bl) + t$edge.length[t$edge.length < min_bl] = min_bl + + message("Writing tree") + ape::write.tree(phy = t, file = outfile) +} + +main() From 69e2f2423af80fc24c5a9d34b11e1333012d51ce Mon Sep 17 00:00:00 2001 From: SamT123 Date: Thu, 17 Jul 2025 09:30:28 +0100 Subject: [PATCH 07/16] update email and message --- R/collapse_short_branches.R | 2 +- proj/weekly-tree/make-cmaple | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/collapse_short_branches.R b/R/collapse_short_branches.R index 833217da..21354a5a 100644 --- a/R/collapse_short_branches.R +++ b/R/collapse_short_branches.R @@ -31,7 +31,7 @@ main = function() { outphy, " and ", outphy.txt, - " already exist! Continuing without re-rotating tree." + " already exist! 
Continuing without re-collapsing branches" ) return(0) } diff --git a/proj/weekly-tree/make-cmaple b/proj/weekly-tree/make-cmaple index ab74af97..3509efe9 100755 --- a/proj/weekly-tree/make-cmaple +++ b/proj/weekly-tree/make-cmaple @@ -10,7 +10,7 @@ sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": sRandomGen = random.SystemRandom() -sMailTo = "sat65@cam.ac.uk" +sMailTo = "weekly-tree@antigenic-cartography.org" # ---------------------------------------------------------------------- From cbfde28154b174f81f206fad460e839a02744491 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Fri, 1 Aug 2025 11:50:44 +0100 Subject: [PATCH 08/16] Add some sequence exclusions by Poppy --- proj/weekly-tree/export | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/proj/weekly-tree/export b/proj/weekly-tree/export index e4261c5f..9dcabea4 100755 --- a/proj/weekly-tree/export +++ b/proj/weekly-tree/export @@ -101,6 +101,10 @@ def bvic(subtype: str, args: argparse.Namespace): # 2025-0625 no exclusions # 2025-0702 "PHUKET/P1187/2025_5A09E7D5", + # 2025-0716 (omitted exclusions in error,later added in 0723) + # 2025-0723 + "ODISHA/1421/2023_6422458A", + # 2025-0729 no exclusions ]) .include_name(outgroup) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) .remove_hash_duplicates() @@ -339,6 +343,19 @@ def h1(subtype: str, args: argparse.Namespace): "VERMONT/UVM-0478/2022_OR_DFD37A87", "VERMONT/UVM-1652/2022_OR_42E6DF65", "FRANCE/NOR-IPP04189/2025_OR_88070E22", # caused tree failure + # 2025-0716 (omitted exclusions in error, added 29/07, some added 0723 PR) + "CAMBODIA/SVH250268/2025_OR_238D9DD5", + "CAMBODIA/IAHC250126/2025_OR_FDAC2F6D", + "CAMBODIA/IKCM250040/2025_OR_7BD02395", + "CAMBODIA/CCH250037/2025_OR_347A5F98", + # 2025-0723 + "NIZHNIY_NOVGOROD/RII-MH239577S/2025_OR_B6AF35B5", + "CAMBODIA/KVH250142/2025_OR_3F8690DC", + "CAMBODIA/IAHC250133/2025_OR_C34D3AC3", + "SURGUT/RII-MH237071S/2025_OR_3A47CC28", + "MOROCCO/1168/2025_OR_910D767C", + # 2025-0729 + "ARKHANGELSK/RII-MH246940S/2025_OR_78BF9CC7", ]) .remove_hash_duplicates() .replace_with_master() @@ -597,6 +614,32 @@ def h3(subtype: str, args: argparse.Namespace): "COTE_DIVOIRE/4331/2024_OR_E0FF092D", # 2025-1202 "WISCONSIN/67/2005_XXMDCK2_8C52B46C", + # 2025-0716 (omitted exclusions in error, added 29/07 PR) + "INDIA/BHO_NIV25-827/2025_MDCK3_C54368E2", + "CAMBODIA/IAHC240402/2024_OR_35F1131A", + "INDIA/PUN_NIVCOV2572/2025_MDCK1_C9AF818D", + "ASTURIAS/232524600/2025_A4329A8F", + "OREL/SWINE-RII-WD2652S/2025_OR_57A8B37C", + "OREL/SWINE-RII-WD2687S/2025_OR_C384E32B", + "MEXICO/2496813201/2025_OR_E2853515", + # 2025-0723 + "NORWAY/6870/2025_OR_828769BE", + "BRITISH_COLUMBIA/RV00479/2025_SIAT2/SIAT1_0DA227AA", + "QUEBEC/240/2025_OR_8D4D59A8", + "GALICIA/52520233/2025_F71B58B8", + "BULGARIA/2506/2025_SIAT2/SIAT1_8B0BEF2D", + "BULGARIA/1941/2025_SIAT2/SIAT1_0D5E6F90", + "CAMBODIA/IBTB250007/2025_OR_4E893E81", + "ASTURIAS/232525925/2025_719B9A6C", + "BAVARIA/6511057/2025_OR_D759C9B5", + "MEXICO/2496813201/2025_OR_E2853515", + "CAMBODIA/SVH250074/2025_OR_2253D125", + "ALBANIA/310521/2025_SIAT1_5DCA4F3D", + "CAMBODIA/IAHC250174/2025_OR_6BFB38A8", + "FRANCE/ARA-HCL023004739801/2023_OR_003FC67E", + "SOUTH_AFRICA/NHLS-UCT-CERI-C067966/2025_EE268891", + "PERU/LIM-UPCH-4915/2025_1E953D52", + # 2025-0729 no exclusions ]) .remove_hash_duplicates() .replace_with_master() From 1357244b922f7d99b937ffdb7019294b3d84e0cf Mon Sep 17 00:00:00 2001 From: SamT123 Date: Fri, 1 Aug 2025 14:57:24 +0100 
Subject: [PATCH 09/16] Add trimming script --- R/collapse_short_branches.R | 7 ++ R/rotate.R | 7 ++ R/set_min_branch_lengths.R | 7 ++ R/tree_trim.R | 166 ++++++++++++++++++++++++++++++++++++ proj/weekly-tree/pdf | 4 +- proj/weekly-tree/raxml-asr | 4 +- proj/weekly-tree/submit | 3 +- 7 files changed, 193 insertions(+), 5 deletions(-) create mode 100644 R/tree_trim.R diff --git a/R/collapse_short_branches.R b/R/collapse_short_branches.R index 21354a5a..e1f868e4 100644 --- a/R/collapse_short_branches.R +++ b/R/collapse_short_branches.R @@ -2,6 +2,13 @@ main = function() { message("COLLAPSE SHORT BRANCH LENGTHS") + .libPaths( + c( + "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2", + .libPaths() + ) + ) + args <- commandArgs(trailingOnly = TRUE) inphy.txt = args[[1]] diff --git a/R/rotate.R b/R/rotate.R index 2d838e39..e2d7d865 100644 --- a/R/rotate.R +++ b/R/rotate.R @@ -2,6 +2,13 @@ main = function() { message("ROTATE TREE") + .libPaths( + c( + "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2", + .libPaths() + ) + ) + args <- commandArgs(trailingOnly = TRUE) inphy.txt = args[[1]] diff --git a/R/set_min_branch_lengths.R b/R/set_min_branch_lengths.R index 04493502..764afaac 100755 --- a/R/set_min_branch_lengths.R +++ b/R/set_min_branch_lengths.R @@ -3,6 +3,13 @@ main = function() { message("SET MINIMUM BRANCH LENGTHS") + .libPaths( + c( + "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2", + .libPaths() + ) + ) + args <- commandArgs(trailingOnly = TRUE) infile = args[[1]] diff --git a/R/tree_trim.R b/R/tree_trim.R new file mode 100644 index 00000000..7feab6cd --- /dev/null +++ b/R/tree_trim.R @@ -0,0 +1,166 @@ +#! /usr/bin/env Rscript + +main = function() { + message("TRIM TREE") + + .libPaths( + c( + "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2", + .libPaths() + ) + ) + + library(seqUtils, quietly = T, warn.conflicts = F, verbose = F) + library(convergence, quietly = T, warn.conflicts = F, verbose = F) + + library(treeio, quietly = T, warn.conflicts = F, verbose = F, ) + + library(purrr, quietly = T, warn.conflicts = F, verbose = F) + library(dplyr, quietly = T, warn.conflicts = F, verbose = F) + library(stringr, quietly = T, warn.conflicts = F, verbose = F) + + convergence:::add_to_PATH("/home/sat65/miniforge3/envs/treebuild/bin") # usher + + # args ------------------------------------------------------------ + args <- commandArgs(trailingOnly = TRUE) + + subtype = args[[1]] + inphy.txt = args[[2]] + infasta = args[[3]] + + inphy = readLines(inphy.txt, n = 1) + + outphy = paste0( + paste0( + fs::path_ext_remove(inphy), + "-trim." + ), + fs::path_ext(inphy) + ) + + outphy.txt = paste0( + paste0( + fs::path_ext_remove(inphy.txt), + "-trim." + ), + fs::path_ext(inphy.txt) + ) + + outfasta = paste0( + paste0( + fs::path_ext_remove(infasta), + "-trim." 
+ ), + fs::path_ext(infasta) + ) + + # specify trim ---------------------------------------------------- + if (subtype == "h3") { + trim = T + strain = "COTE_DIVOIRE/1904/2022" + mutation = "223V" + min_tips = 15000 + } else if (subtype == "h1") { + trim = F + strain = NA_character_ + mutation = NA_character_ + min_tips = 0 + } else if (subtype == "bvic") { + trim = F + strain = NA_character_ + mutation = NA_character_ + min_tips = 0 + } else if (subtype == "byam") { + trim = F + strain = NA_character_ + mutation = NA_character_ + min_tips = 0 + } else { + stop("Invalid subtype ", subtype) + } + + # read input files ------------------------------------------------------------ + + phy = castor::read_tree(file = inphy) + fa = seqUtils::fast_fasta(infasta) + + # trim ------------------------------------------------------------ + if (!trim) { + trimmed_phy = phy + } else { + tree_and_sequences = list( + tree = phy, + sequences = tibble::tibble( + Isolate_unique_identifier = names(fa), + dna_sequence = seqUtils::clean_sequences(unname(fa), type = "nt") + ) + ) + + usher_tree_and_sequences = convergence::addASRusher( + tree_and_sequences, + nuc_ref = seqUtils::alaska_232_2015_nts, + aa_ref = seqUtils::alaska_232_2015_aas + ) + + usher_tree_and_sequences$tree_tibble$nd = c( + rep(1, ape::Ntip(usher_tree_and_sequences$tree)), + castor::count_tips_per_node(usher_tree_and_sequences$tree) + ) + + strain_fullname = usher_tree_and_sequences$tree$tip.label[ + stringr::str_detect( + usher_tree_and_sequences$tree$tip.label, + stringr::fixed(strain) + ) + ] + + parents = rev(convergence:::getParents( + usher_tree_and_sequences$tree_tibble, + which(usher_tree_and_sequences$tree_tibble$label == strain_fullname) + )) + + parents_mutations = usher_tree_and_sequences$tree_tibble$aa_mutations[ + parents + ] + + ancestor = parents[ + purrr::map_lgl( + parents_mutations, + ~ any(stringr::str_detect(.x, stringr::fixed(mutation))) + ) + ] + + trimmed_phy = castor::get_subtree_at_node( + usher_tree_and_sequences$tree, + ancestor - ape::Ntip(usher_tree_and_sequences$tree) + )$subtree + + if (ape::Ntip(trimmed_phy) < min_tips) { + stop( + "Fewer than `min_tips (= ", + min_tips, + ") tips in trimmed phylogeny. Perhaps the wrogn branch was found?" + ) + } + + trimmed_phy = ape::ladderize(trimmed_phy, right = F) + trimmed_phy$edge.length = trimmed_phy$edge.length / mean(nchar(fa)) # per nt + trimmed_seqs = fa[names(fa) %in% trimmed_phy$tip.label] + + message("Writing tree") + ape::write.tree(trimmed_phy, file = outphy) + + message("Writing ", outphy.txt) + writeLines(outphy, outphy.txt) + + message("Writing ", outfasta) + seqUtils::write_fast_fasta( + seqs = unname(trimmed_seqs), + names = names(trimmed_seqs), + path = outfasta + ) + return(0) + } +} + +main() diff --git a/proj/weekly-tree/pdf b/proj/weekly-tree/pdf index b54aee9c..f8d1fa5d 100755 --- a/proj/weekly-tree/pdf +++ b/proj/weekly-tree/pdf @@ -18,7 +18,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "b" ]]; then HIDB_SUBTYPE=b; fi # ---------------------------------------------------------------------------------------------------- if [[ ! 
-f ${SUBTYPE}/${SUBTYPE}.tjz ]]; then - ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-col.txt) ${SUBTYPE}/${SUBTYPE}.tjz + ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-trim-col.txt) ${SUBTYPE}/${SUBTYPE}.tjz fi if [[ ! -f ${SUBTYPE}/${SUBTYPE}.pdf ]]; then tal -s ${TAL} ${SUBTYPE}/${SUBTYPE}.tjz ${SUBTYPE}/${SUBTYPE}.txt ${SUBTYPE}/${SUBTYPE}.pdf @@ -30,7 +30,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "h" ]]; then ASR_INFIX=".asr" if [[ ! -f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ]]; then ASR_STATES=$(cat ${SUBTYPE}/raxml-asr-best.txt | sed 's/ancestralTree/ancestralStates/') - ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-col.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz + ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-trim-col.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz fi if [[ ! -f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf ]]; then tal -s ${TAL} -D aa-transitions-method=imported ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.txt ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf diff --git a/proj/weekly-tree/raxml-asr b/proj/weekly-tree/raxml-asr index a736e54b..6dea16c5 100755 --- a/proj/weekly-tree/raxml-asr +++ b/proj/weekly-tree/raxml-asr @@ -10,8 +10,8 @@ sRandomGen = random.SystemRandom() def main(args: argparse.Namespace): argv = '"' + '" "'.join(sys.argv) + '"' working_dir = Path(args.subdir).resolve() - source_fas = working_dir.joinpath("source.fas") - source_tree = Path(working_dir.joinpath("cmaple_tree-rot.txt").open().read().strip()).resolve() + source_fas = working_dir.joinpath("source-trim.fas") + source_tree = Path(working_dir.joinpath("cmaple_tree-rot-trim.txt").open().read().strip()).resolve() if os.fork(): # parent diff --git a/proj/weekly-tree/submit b/proj/weekly-tree/submit index 9f1ab027..caa4e764 100755 --- a/proj/weekly-tree/submit +++ b/proj/weekly-tree/submit @@ -10,8 +10,9 @@ submit_tree() ( ${AE_ROOT}/proj/weekly-tree/make-cmaple --node ${NODE} ${SUBTYPE} && ${AE_ROOT}/R/rotate.R ${SUBTYPE}/cmaple_tree.txt ${SUBTYPE}/source.fas && + ${AE_ROOT}/R/tree_trim.R ${SUBTYPE} ${SUBTYPE}/cmaple_tree-rot.txt ${SUBTYPE}/source.fas && ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} && - ${AE_ROOT}/R/collapse_short_branches.R ${SUBTYPE}/cmaple_tree-rot.txt 0.00005 && + ${AE_ROOT}/R/collapse_short_branches.R ${SUBTYPE}/cmaple_tree-rot-trim.txt 0.00005 && ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE} ) >${SUBTYPE}/submit.log 2>&1 & } From 6ebbd113573f920c0bd18f568c1f05da2ae2e8a6 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Sat, 2 Aug 2025 23:01:47 +0100 Subject: [PATCH 10/16] fix trim-tree to save phy and seqs when no trimming is performed --- R/tree_trim.R | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/R/tree_trim.R b/R/tree_trim.R index 7feab6cd..64c1c026 100644 --- a/R/tree_trim.R +++ b/R/tree_trim.R @@ -87,6 +87,7 @@ main = function() { # trim 
------------------------------------------------------------ if (!trim) { trimmed_phy = phy + trimmed_seqs = fa } else { tree_and_sequences = list( tree = phy, @@ -146,21 +147,21 @@ main = function() { trimmed_phy = ape::ladderize(trimmed_phy, right = F) trimmed_phy$edge.length = trimmed_phy$edge.length / mean(nchar(fa)) # per nt trimmed_seqs = fa[names(fa) %in% trimmed_phy$tip.label] + } - message("Writing tree") - ape::write.tree(trimmed_phy, file = outphy) + message("Writing tree") + ape::write.tree(trimmed_phy, file = outphy) - message("Writing ", outphy.txt) - writeLines(outphy, outphy.txt) + message("Writing ", outphy.txt) + writeLines(outphy, outphy.txt) - message("Writing ", outfasta) - seqUtils::write_fast_fasta( - seqs = unname(trimmed_seqs), - names = names(trimmed_seqs), - path = outfasta - ) - return(0) - } + message("Writing ", outfasta) + seqUtils::write_fast_fasta( + seqs = unname(trimmed_seqs), + names = names(trimmed_seqs), + path = outfasta + ) + return(0) } main() From cb3a8719acb4a637e2514b306b9f21f98047bd88 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Wed, 10 Sep 2025 14:49:46 +0100 Subject: [PATCH 11/16] Add bvic trim specification to R/tree_trim.R --- R/tree_trim.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/tree_trim.R b/R/tree_trim.R index 64c1c026..834fed49 100644 --- a/R/tree_trim.R +++ b/R/tree_trim.R @@ -66,10 +66,10 @@ main = function() { mutation = NA_character_ min_tips = 0 } else if (subtype == "bvic") { - trim = F - strain = NA_character_ - mutation = NA_character_ - min_tips = 0 + trim = T + strain = "OMAN/2651/2019" + mutation = "127T" + min_tips = 15000 } else if (subtype == "byam") { trim = F strain = NA_character_ @@ -81,7 +81,7 @@ main = function() { # read input files ------------------------------------------------------------ - phy = castor::read_tree(file = inphy) + phy = ape::read.tree(file = inphy) fa = seqUtils::fast_fasta(infasta) # trim ------------------------------------------------------------ @@ -99,8 +99,8 @@ main = function() { usher_tree_and_sequences = convergence::addASRusher( tree_and_sequences, - nuc_ref = seqUtils::alaska_232_2015_nts, - aa_ref = seqUtils::alaska_232_2015_aas + nuc_ref = fa[1], + aa_ref = as.character(Biostrings::translate(Biostrings::DNAString(fa[1]))) ) usher_tree_and_sequences$tree_tibble$nd = c( From 6ab2b23af98701c123d84358ee4ddff0746b6033 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Wed, 10 Sep 2025 14:50:24 +0100 Subject: [PATCH 12/16] Run bvic on i18 --- proj/weekly-tree/submit | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proj/weekly-tree/submit b/proj/weekly-tree/submit index caa4e764..4c5fd516 100755 --- a/proj/weekly-tree/submit +++ b/proj/weekly-tree/submit @@ -19,4 +19,4 @@ submit_tree() submit_tree i20 h1 submit_tree i22 h3 -submit_tree i21 bvic +submit_tree i18 bvic From 78c4fdc22371d7771fc107be1369f80a32a97980 Mon Sep 17 00:00:00 2001 From: SamT123 Date: Wed, 10 Sep 2025 16:47:42 +0100 Subject: [PATCH 13/16] Skip R/tree_trim.R if trimmed tree already exists --- R/tree_trim.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/tree_trim.R b/R/tree_trim.R index 834fed49..3825a94e 100644 --- a/R/tree_trim.R +++ b/R/tree_trim.R @@ -54,6 +54,10 @@ main = function() { fs::path_ext(infasta) ) + if (all(fs::file_exists(outphy, outphy.txt, outfasta))) { + message("All output files already exist. 
Not remaking.")
+    }
+
     # specify trim ----------------------------------------------------
     if (subtype == "h3") {
         trim = T

From 8fa85065d6584777916dad7d5e03604434fda0f0 Mon Sep 17 00:00:00 2001
From: SamT123
Date: Thu, 11 Sep 2025 12:38:41 +0100
Subject: [PATCH 14/16] Small bugfix in R/tree_trim.R

---
 R/tree_trim.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/tree_trim.R b/R/tree_trim.R
index 3825a94e..c389fd90 100755
--- a/R/tree_trim.R
+++ b/R/tree_trim.R
@@ -54,7 +54,7 @@ main = function() {
         fs::path_ext(infasta)
     )
 
-    if (all(fs::file_exists(outphy, outphy.txt, outfasta))) {
+    if (all(fs::file_exists(c(outphy, outphy.txt, outfasta)))) {
         message("All output files already exist. Not remaking.")
     }
 
From 7410b56921b9ecf76517e00b243ec1e4ccc7ec08 Mon Sep 17 00:00:00 2001
From: SamT123
Date: Thu, 11 Sep 2025 13:01:14 +0100
Subject: [PATCH 15/16] Change tree trimming method

Previously: trim to the first ancestor of sequence X that carries mutation Y.
This was fragile: if that one sequence happens to be placed outside the clade
we are aiming for, the trim is wrong (in practice it fails, because the
trimmed tree then contains too few sequences).

Now: find all nodes carrying the target mutation whose reconstructed
sequences contain every mutation in 'required_mutations' and none in
'forbidden_mutations', check that exactly one such clade has at least
min_tips descendants, and trim to that clade.
---
 R/tree_trim.R | 66 +++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/R/tree_trim.R b/R/tree_trim.R
index c389fd90..723760fe 100755
--- a/R/tree_trim.R
+++ b/R/tree_trim.R
@@ -61,23 +61,27 @@ main = function() {
     # specify trim ----------------------------------------------------
     if (subtype == "h3") {
         trim = T
-        strain = "COTE_DIVOIRE/1904/2022"
         mutation = "223V"
+        required_mutations = list() # list(list(at = 100, to = "T"), list(at = ...))
+        forbidden_mutations = list()
         min_tips = 15000
     } else if (subtype == "h1") {
         trim = F
-        strain = NA_character_
-        mutation = NA_character_
+        mutation = NA
+        required_mutations = list()
+        forbidden_mutations = list()
         min_tips = 0
     } else if (subtype == "bvic") {
         trim = T
-        strain = "OMAN/2651/2019"
         mutation = "127T"
+        required_mutations = list()
+        forbidden_mutations = list()
         min_tips = 15000
     } else if (subtype == "byam") {
         trim = F
-        strain = NA_character_
-        mutation = NA_character_
+        mutation = NA
+        required_mutations = list()
+        forbidden_mutations = list()
         min_tips = 0
     } else {
         stop("Invalid subtype ", subtype)
@@ -112,42 +116,38 @@ main = function() {
         castor::count_tips_per_node(usher_tree_and_sequences$tree)
     )
 
-        strain_fullname = usher_tree_and_sequences$tree$tip.label[
-            stringr::str_detect(
-                usher_tree_and_sequences$tree$tip.label,
-                stringr::fixed(strain)
-            )
-        ]
+        tree_tibble_mutation_occs = usher_tree_and_sequences$tree_tibble %>%
+            filter(
+                map_lgl(aa_mutations_nonsyn, ~mutation %in% str_sub(.x, 2)),
+                nd >= min_tips
+            ) %>%
+            arrange(-nd)
+
+        for (r_m in required_mutations){
+            at = as.integer(r_m[["at"]])
+            to = r_m[["to"]]
+            tree_tibble_mutation_occs = tree_tibble_mutation_occs %>%
+                filter(substr(reconstructed_aa_sequence, at, at) == to)
+        }
 
-        parents = rev(convergence:::getParents(
-            usher_tree_and_sequences$tree_tibble,
-            which(usher_tree_and_sequences$tree_tibble$label == strain_fullname)
-        ))
+        for (f_m in forbidden_mutations){
+            at = as.integer(f_m[["at"]])
+            to = f_m[["to"]]
+            tree_tibble_mutation_occs = tree_tibble_mutation_occs %>%
+                filter(substr(reconstructed_aa_sequence, at, at) != to)
+        }
 
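+        # Illustrative spec only (not one of the subtype settings above; positions
+        # are hypothetical): to keep the clade defined by mutation "223V" while
+        # also requiring 100T and forbidding 131K in the reconstructed ancestral
+        # sequence, the "specify trim" section would be set to:
+        #   mutation            = "223V"
+        #   required_mutations  = list(list(at = 100, to = "T"))
+        #   forbidden_mutations = list(list(at = 131, to = "K"))
 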
- parents_mutations = usher_tree_and_sequences$tree_tibble$aa_mutations[ - parents - ] + if (nrow(tree_tibble_mutation_occs) != 1){ + stop("No tree branches fulfilling specification found ") + } - ancestor = parents[ - purrr::map_lgl( - parents_mutations, - ~ any(stringr::str_detect(.x, stringr::fixed(mutation))) - ) - ] + ancestor = tree_tibble_mutation_occs$node[[1]] trimmed_phy = castor::get_subtree_at_node( usher_tree_and_sequences$tree, ancestor - ape::Ntip(usher_tree_and_sequences$tree) )$subtree - if (ape::Ntip(trimmed_phy) < min_tips) { - stop( - "Fewer than `min_tips (= ", - min_tips, - ") tips in trimmed phylogeny. Perhaps the wrogn branch was found?" - ) - } - trimmed_phy = ape::ladderize(trimmed_phy, right = F) trimmed_phy$edge.length = trimmed_phy$edge.length / mean(nchar(fa)) # per nt trimmed_seqs = fa[names(fa) %in% trimmed_phy$tip.label] From 537dd94c61c52b73f90745868f8f31c3e785ba13 Mon Sep 17 00:00:00 2001 From: Sarah James Date: Thu, 11 Dec 2025 16:33:52 +0000 Subject: [PATCH 16/16] Add sequence exclusions --- proj/weekly-tree/export | 1337 +++++++++++++++++++++++---------------- 1 file changed, 777 insertions(+), 560 deletions(-) diff --git a/proj/weekly-tree/export b/proj/weekly-tree/export index 9dcabea4..bc61c172 100755 --- a/proj/weekly-tree/export +++ b/proj/weekly-tree/export @@ -2,23 +2,50 @@ import sys, os, argparse, traceback from pathlib import Path + sys.path[:0] = [str(Path(os.environ["AE_ROOT"], dir)) for dir in ["build", "py"]] import ae_backend from ae.sequences.source import fasta # ====================================================================== -sSubtypeForSeqdb = {"B": "B", "BVIC": "B", "BYAM": "B", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} -sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubtypeForSeqdb = { + "B": "B", + "BVIC": "B", + "BYAM": "B", + "A(H1N1)": "A(H1N1)", + "H1": "A(H1N1)", + "1": "A(H1N1)", + "A(H3N2)": "A(H3N2)", + "H3": "A(H3N2)", + "3": "A(H3N2)", +} +sSubtypeNorm = { + "BV": "BVIC", + "BVIC": "BVIC", + "BY": "BYAM", + "BYAM": "BYAM", + "A(H1N1)": "A(H1N1)", + "H1": "A(H1N1)", + "1": "A(H1N1)", + "A(H3N2)": "A(H3N2)", + "H3": "A(H3N2)", + "3": "A(H3N2)", +} sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + def main(args: argparse.Namespace): - subtypes = [sSubtypeNorm[st] for st in (args.subtype or ["BVIC", "BYAM", "H1", "H3"])] + subtypes = [ + sSubtypeNorm[st] for st in (args.subtype or ["BVIC", "BYAM", "H1", "H3"]) + ] for subtype in subtypes: sSubtypeToFunc[subtype](subtype=subtype, args=args) + # ---------------------------------------------------------------------- + def bvic(subtype: str, args: argparse.Namespace): outgroup = "VICTORIA/830/2013" selected = ( @@ -28,85 +55,116 @@ def bvic(subtype: str, args: argparse.Namespace): .lineage("VICTORIA") .exclude_with_issue() .filter_dates(first="2016-06") - .find_masters() # to be able to use .filter below - .exclude(lambda ref: ref.aa["165-"]) # exclude 6-del and MIE/1/2019 with garbage in the middle - .exclude_name([ - "ANHUI_JINGHU/63/2019", # garbage in 406-414 leading to a long branch (2021-0722) - "LISBOA/NISU182_17-18/2018", # yamagata, long branch 2021-1117 - # "ZAMBIA/503/2021", # potential long branch 2022-0401 - # 2023-01-28 potential long branch - "KRABI/FS002/2022", - "CATALONIA/NSVH151199301/2023_OR_0ACCE47F", # XXXX 
2023-0421 - "SOUTH_AFRICA/R10785/2022_OR_0EB8BBEA", # XXX 2023-0515 - "SOUTH_AFRICA/R10907/2022_OR_BA5B358C", - "SOUTH_AFRICA/R09298/2022_OR_1A5C630A", - "SOUTH_AFRICA/R10914/2022_OR_ADA6D682", - "SOUTH_AFRICA/R10638/2022_OR_5EAB4CBB", - "SOUTH_AFRICA/R11310/2022_OR_12125C84", - "SOUTH_AFRICA/R11575/2022_OR_FED84F4F", - "SOUTH_AFRICA/R08092/2022_OR_93DDF2B2", - # XXX 2023-0602 - "LA_REUNION/HCL023041635201/2022_103E7754", - "ROMANIA/550950/2023_MDCK-SIAT_E0DECCD2", - "SAINT_PETERSBURG/RII-10866S/2023_OR_477275F7", - # XXX2023-0629 - "ABU_DHABI/4030/2022_AEC2DE48", - "RIO_GRANDE_DO_SUL/28773/2022_455F4FFF", - # 2023-0731 - "MALAYSIA/IMR-SARI0563/2023_25FE3E0E", - #2023-0811 Yam - "KENYA/KCH157321/2019_OR_308AC34A", - # 2023-0822 XXX - "ALAGOAS/5742/2023_OR_441065AD", - "ALAGOAS/4659/2023_OR_558AC3D6", - "PARANA/5012/2023_OR_D54659DC", - # 2023-1005 XXXX - "UNITED_KINGDOM/UO-476/2023", - # 2024-0130 - "SANTA_CATARINA/10748/2023_OR_7B192215", - # 2024-0703 - "MADRID/HULP-200063972/2024_C98ADFD1", # unknown at deletion positions 162-164 - "MADRID/HULP-200041842/2024_5D4B7141", # unknown at deletion positions 162-164 - # 2024-0718 - "AUSTRIA/MUW1681066/2024_OR_3D9C21FB", # XXXX - "ATHENS/ILI411/2024_OR_556F1400", # XXXX - # 2024-0411 - "PAIS_VASCO/1919/2024_OR_1D682639", - "GALICIA/52314290/2024_AF2005E4", - # 2025-2411 - "MADAGASCAR/3422/2024_OR_DF3D589F", - "MEXICO/2429277701/2024_OR_3BF2F243", - "GALICIA/GA-CHUAC-250/2025_OR_AC92C5F3", - # 2025-1202 - "PAKISTAN/NIC-JHL/2010_OR_EC56A7FD", - "PAKISTAN/NIC-ICT-284/2025_OR_8DAE4B6A", - "PAKISTAN/NIC-ICT-218/2025_OR_362FAB6C", - "PAKISTAN/NMC-12/2025_OR_7B217EEC", - "PAKISTAN/NIC-ICT-190/2025_OR_D4D6D601", - "ANDALUCIA/PMC-00528/2025_OR_549D6405", - # 2025-0313 - "GALICIA/52463655/2025_0971E6C6", - # 2025-0507 - "THUERINGEN/31/2025_OR_758A40BD", - "FRANCE/ARA-HCL025053223301/2025_OR_1C544F25", - "ABAY/NRL-2330/2025_C2E364F7", - "RIO_DE_JANEIRO/533/2025_OR_C04A6642", - "LA_REUNION/HCL023041635201/2022_OR_103E7754", - "ASTURIAS/232465739/2024_EA8BEFAB", - "UKRAINE/3471/2025_6E494813", - # 2025-0521 - "GALICIA/52475765/2025_F76C5888", - # 2025-0611 no exclusions - # 2025-0618 no exclusions - # 2025-0625 no exclusions - # 2025-0702 - "PHUKET/P1187/2025_5A09E7D5", - # 2025-0716 (omitted exclusions in error,later added in 0723) - # 2025-0723 - "ODISHA/1421/2023_6422458A", - # 2025-0729 no exclusions - ]) - .include_name(outgroup) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) + .find_masters() # to be able to use .filter below + .exclude( + lambda ref: ref.aa["165-"] + ) # exclude 6-del and MIE/1/2019 with garbage in the middle + .exclude_name( + [ + "ANHUI_JINGHU/63/2019", # garbage in 406-414 leading to a long branch (2021-0722) + "LISBOA/NISU182_17-18/2018", # yamagata, long branch 2021-1117 + # "ZAMBIA/503/2021", # potential long branch 2022-0401 + # 2023-01-28 potential long branch + "KRABI/FS002/2022", + "CATALONIA/NSVH151199301/2023_OR_0ACCE47F", # XXXX 2023-0421 + "SOUTH_AFRICA/R10785/2022_OR_0EB8BBEA", # XXX 2023-0515 + "SOUTH_AFRICA/R10907/2022_OR_BA5B358C", + "SOUTH_AFRICA/R09298/2022_OR_1A5C630A", + "SOUTH_AFRICA/R10914/2022_OR_ADA6D682", + "SOUTH_AFRICA/R10638/2022_OR_5EAB4CBB", + "SOUTH_AFRICA/R11310/2022_OR_12125C84", + "SOUTH_AFRICA/R11575/2022_OR_FED84F4F", + "SOUTH_AFRICA/R08092/2022_OR_93DDF2B2", + # XXX 2023-0602 + "LA_REUNION/HCL023041635201/2022_103E7754", + "ROMANIA/550950/2023_MDCK-SIAT_E0DECCD2", + "SAINT_PETERSBURG/RII-10866S/2023_OR_477275F7", + # XXX2023-0629 + "ABU_DHABI/4030/2022_AEC2DE48", + 
"RIO_GRANDE_DO_SUL/28773/2022_455F4FFF", + # 2023-0731 + "MALAYSIA/IMR-SARI0563/2023_25FE3E0E", + # 2023-0811 Yam + "KENYA/KCH157321/2019_OR_308AC34A", + # 2023-0822 XXX + "ALAGOAS/5742/2023_OR_441065AD", + "ALAGOAS/4659/2023_OR_558AC3D6", + "PARANA/5012/2023_OR_D54659DC", + # 2023-1005 XXXX + "UNITED_KINGDOM/UO-476/2023", + # 2024-0130 + "SANTA_CATARINA/10748/2023_OR_7B192215", + # 2024-0703 + "MADRID/HULP-200063972/2024_C98ADFD1", # unknown at deletion positions 162-164 + "MADRID/HULP-200041842/2024_5D4B7141", # unknown at deletion positions 162-164 + # 2024-0718 + "AUSTRIA/MUW1681066/2024_OR_3D9C21FB", # XXXX + "ATHENS/ILI411/2024_OR_556F1400", # XXXX + # 2024-0411 + "PAIS_VASCO/1919/2024_OR_1D682639", + "GALICIA/52314290/2024_AF2005E4", + # 2025-2411 + "MADAGASCAR/3422/2024_OR_DF3D589F", + "MEXICO/2429277701/2024_OR_3BF2F243", + "GALICIA/GA-CHUAC-250/2025_OR_AC92C5F3", + # 2025-1202 + "PAKISTAN/NIC-JHL/2010_OR_EC56A7FD", + "PAKISTAN/NIC-ICT-284/2025_OR_8DAE4B6A", + "PAKISTAN/NIC-ICT-218/2025_OR_362FAB6C", + "PAKISTAN/NMC-12/2025_OR_7B217EEC", + "PAKISTAN/NIC-ICT-190/2025_OR_D4D6D601", + "ANDALUCIA/PMC-00528/2025_OR_549D6405", + # 2025-0313 + "GALICIA/52463655/2025_0971E6C6", + # 2025-0507 + "THUERINGEN/31/2025_OR_758A40BD", + "FRANCE/ARA-HCL025053223301/2025_OR_1C544F25", + "ABAY/NRL-2330/2025_C2E364F7", + "RIO_DE_JANEIRO/533/2025_OR_C04A6642", + "LA_REUNION/HCL023041635201/2022_OR_103E7754", + "ASTURIAS/232465739/2024_EA8BEFAB", + "UKRAINE/3471/2025_6E494813", + # 2025-0521 + "GALICIA/52475765/2025_F76C5888", + # 2025-0611 no exclusions + # 2025-0618 no exclusions + # 2025-0625 no exclusions + # 2025-0702 + "PHUKET/P1187/2025_5A09E7D5", + # 2025-0716 (omitted exclusions in error,later added in 0723) + # 2025-0723 + "ODISHA/1421/2023_6422458A", + # 2025-0729 no exclusions + # 2025-0818 + "INDIA/PUD-NIV25-1188/2025_MDCK3_935FB615", + # 2025-0827 no exclusions + # 2025-0910 no exclusions + # 2025-0917 + "INDIA/PUD-NIV25-1188/2025_MDCK3_935FB615", + "AVILA/227/2024_FLU0789_03C986A9", + "AVILA/43/2025_FLU0874_624F5516", + "SALAMANCA/214/2024_FLU0771_DF1DA731", + "LA_REUNION/CHU725061509401/2025_7A8734BB", + # 2025-1006 + "MEXICO/240450000409/2024_OR_1174529C", + # 2025-1028 no exclusions + # 2025-1112 + "SANTA_CATARINA/4844/2025_OR_8B4C49A1", + "TOULON/INF_20/2023_OR_06B04496", + "TOULON/INF_21/2023_OR_C17020D2", + "SANTA_CATARINA/4290/2025_OR_27BF4310", + "INDIA/RES-23-4646/2023_AB3871EB", + "INDIA/RES-23-1799/2023_840D4617", + "INDIA/RES-23-2404/2023_159F0FCE", + "INDIA/RES-23-2414/2023_A4EA11A3", + # 2025-1126 + "ODISHA/956/2023_9DD47677", + "B/LEE/40_6BCD57B7", + ] + ) + .include_name( + outgroup + ) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) .remove_hash_duplicates() .replace_with_master() .check() @@ -116,8 +174,10 @@ def bvic(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1710, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- + def byam(subtype: str, args: argparse.Namespace): outgroup = "MINNESOTA/2/2014" selected = ( @@ -127,9 +187,15 @@ def byam(subtype: str, args: argparse.Namespace): .lineage("YAMAGATA") .exclude_with_issue() .filter_dates(first="2015") - .find_masters() # to be able to use .filter below - .include_name([outgroup]) # , "IDAHO/1/2014", "MASSACHUSETTS/7/2014"]) - .exclude_name([lin for lin in (line.strip() for line in Path("byam-exclude.txt").open()) if lin and lin[0] != "#"]) + .find_masters() # to be able to use .filter below + 
.include_name([outgroup]) # , "IDAHO/1/2014", "MASSACHUSETTS/7/2014"]) + .exclude_name( + [ + lin + for lin in (line.strip() for line in Path("byam-exclude.txt").open()) + if lin and lin[0] != "#" + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") @@ -138,15 +204,21 @@ def byam(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1710, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- + def h1(subtype: str, args: argparse.Namespace): h1_include_txt = Path("h1-include.txt") if not h1_include_txt.exists(): h1_include_txt = Path(sys.argv[0]).parent.joinpath(h1_include_txt.name) if not h1_include_txt.exists(): raise RuntimeError("> cannot find {h1_include_txt}") - to_include = [lin for lin in (line.strip() for line in h1_include_txt.open()) if lin and lin[0] != "#"] + to_include = [ + lin + for lin in (line.strip() for line in h1_include_txt.open()) + if lin and lin[0] != "#" + ] # to_include = to_include[:-500] outgroup = "SOUTH_AUSTRALIA/30/2013" selected = ( @@ -155,218 +227,294 @@ def h1(subtype: str, args: argparse.Namespace): .human() .exclude_with_issue() .filter_dates(first="2019") - .find_masters() # to be able to use .filter below - .include_name([ - outgroup, - "MICHIGAN/45/2015", - "BRAZIL/5026/2017", # # 183S, trying to show "the different 183P clades has S183P independently", inferred from 2018-0924-ssm tree - ] + to_include) # ~/AD/share/conf/h1.183S.2017-2018.seqids - .exclude_name([ - "SWINE/", - # 2020-08-10 Sarah: They're not sequenced by a CC or NIC, but by a company. The sequences don't have obvious errors, but are very different (obviously) from the rest. My feeling is they're either variant viruses (from swine) or a sequencing error. 
- "ANKARA/14015-724/2019", - "ANKARA/14017-004/2019", - "ANKARA/14015-736/2019", - # 2021-06-22 deletion at the end - # "BADEN-WURTTEMBERG/1/2020", - # long branch 2021-0622 - "DENMARK/1/2021", - "MANITOBA/2/2021", - # 2021-1117 long branches - "MECKLENBURG-VORPOMMERN/1/2021", - "MECKLENBURG-VORPOMMERN/1-A/2021", # refers to MECKLENBURG-VORPOMMERN/1/2021 - "NETHERLANDS/10370-1/2020", - "NETHERLANDS/10370-1A/2020", - "NETHERLANDS/10370-2/2020", # refers to NETHERLANDS/10370-1A/2020 - "NETHERLANDS/GENT-193/2019", - "HESSEN/47/2020", - "HUBEI WUJIAGANG/1324/2020", - "HEBEI HAIGANG/SWL1572/2019", - "SHANDONG/204/2021", - "YUNNAN MENGZI/1462/2020", - "NORTH CAROLINA/15/2020", - "WISCONSIN/3/2021", - "PARANA/10835/2021", - # 2021-1125 long branches - "GANSU_XIFENG/1194/2021", - "GANSU_XIFENG/1143/2021", - "SICHUAN/1208/2021", - "TIANJIN/30/2020", - "WISCONSIN/5/2021", - "WISCONSIN/4/2021", - "NORTH_DAKOTA/12226/2021", - # 2021-1217 potential long branch - "IOWA/1/2021", # --> SWINE/MISSOURI/A01104146/2020 (base), SWINE/MISSOURI/A02525160/2021 - "IOWA/6/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" - "NORTH DAKOTA/12226/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" - # 2021-1217 long branches - "NETHERLANDS/10370-1A/2020", - # 2022-0131 potential long branch - "DENMARK/36/2021", - # 2022-04-26 mess in 156-179 - "INDIA/NIV-NICOBAR91/2020_OR_01A87F80", - "INDIA/NIV-NICOBAR163/2020_OR_823B2A0F", - "INDIA/NIV-NICOBAR106/2020_OR_B81F1DC2", - "INDIA/NIV-NICOBAR164/2020_OR_37A11133", - # 2022-10-07 potential long branch - "PARANA/20675/2022_OR_3D8D671C", - # 2022-12-10 potential long branch - "DAKAR/3124/2021_279E04A6", - # 2022-12-23 potential long branch - "NAVARRA/4050/2022_OR_BBC31B3A", - # 2023-01-13 potential long branch - "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", - # 2023-01-28 potential long branches - "NAVARRA/4050/2022_OR_BBC31B3A", - "DAKAR/3124/2021_279E04A6 pdm09-like", - "PAKISTAN/GIHSN-GB-445/2022_OR_A20D55B6", - "PAKISTAN/GIHSN-ICT-152/2023_OR_572EBEB8", - "PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", - "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", - "CATALONIA/NSVH101996495/2023_OR_2E610047", - # 2023-02-04 potential long branch - "SWITZERLAND/69094/2022_OR_B3C15C9B", - # 2023-0421 potential long branches - "CATALONIA/NSVH510598655/2022_OR_57BCC58B", - "PAKISTAN/1465/2023_OR_1B7B0A1A", - "BADEN-WURTTEMBERG/159/2022_OR_6AB83E7E", - "PARANA/44706/2021_OR_25BFEB40", - # XXX 2023-0515 - "A/PAKISTAN/GIHSN/ICT/1838/2023_0867EDC7", - "A/PAKISTAN/GIHSN/ICT/1634/2023_OR_3D55578F", - "A/PAKISTAN/GIHSN/ICT/1798/2023_OR_C271B3C9", - "A/PAKISTAN/GIHSN/ICT/1682/2023_OR_FB7A0386", - # XXX 2023-0602 - "PAKISTAN/GIHSN-ICT2035/2023_OR_4E73FFF6", - # XXX 2023-0731 - "CYCLADES/215/2022_D164055B", - #long branch & XXXX 2023-0811 - "GREECE/SRIHER-001/2022_OR_E68E256A", - "SAUDI_ARABIA/40/2020_OR_9B1E74A4", - "SAUDI_ARABIA/39/2020_OR_DAE6A100", - # 2023-0822 XXXX - "PARANA/6405/2023_OR_9ACD4010", - "BAHIA/5785/2023_OR_CBFA332A", - # 2023-0903 very long branch - "NETHERLANDS/10534/2023_OR_6221D5BF", - # 2023-09-08 long branch in tree - "BADAJOZ/18570374/2023_58158069", - # 2023-09-15 XXXX - "ALGIERS/251/2023_OR_1A27768", - # 2023-10-23 XXX & odd start for SAO_PAULO - "SAO_PAULO/CEVI_2300134/2023_OR_BB923F69", - "CAMBODIA/KVH230151/2023_OR_1F665D9C", - "CAMBODIA/AHC230229/2023_OR_3241A3FC", - # 2023-20-28 - "BULGARIA/725/2023_8A6AF67F", # nonsense 1-29 - "CAMBODIA/CCH230213/2023_OR_BE937275", # XXXX - # 2023-12-04 - "BANGKOK/P5113/2023_OR_CB7624FD", # long branch - # 
2024-01-01 - "SWITZERLAND/114/2023_MDCK1_A2D22DAA", # long branch - # 2024-01-30 - "CATALONIA/NSAV198289092/2023_OR_291AD7ED" - "PARANA/13897/2023_OR_0A657176", - "BELGIUM/H00082/2023_56A80548", - "CACERES/59816382/2023_A1324011", - "SENEGAL/2098/2023_OR_2EC7BF97", - # 2024-0703 - "GALICIA/GA-CHUAC-98/2024_OR_988888E6", # 29 nuc differences, 19 nuc differences - "MEXICO/2271/2021_338E9C30", # run of differences, 35-38, 40-42 - "ALAGOAS/CEVI_2200198/2021_OR_0811FCC2", # 6 X - "MADRID/1832324191549/2024_OR_988BC424", # 9 X - "HUMAN/KENYA/5866/2023_3F1D8A0B", # 9X - # 2024-0718 - "SAO_PAULO/CEVI_24000816/2024_OR_BC9F903C", # XXXX - "SAO_PAULO/CEVI_24000889/2024_OR_C69399E2", # dels - # 2024-0727 - "LUXEMBOURG/LNS3963248/2024_04C1C479", # XXXX - "COSTA_RICA/INC-0552-822365/2024_OR_9E1DE43D", # XXXX - # 2024-08-06 - "CHANTHABURI/P2877/2024_OR_5B993EF6", # XXXXX - "SRI_LANKA/R4_BC75/2024_50C5FC20" # XXXXX - # 2024-0830 XXXX & many changes - "OHIO/251/2024_OR_30CB2DAF", - "HANOI/PF324163/2024_OR_402B095E", - "SOUTH_AFRICA/PET32788/2024_OR_47322CBB", - # 2024-0902 long branch - "YAOUNDE/24V-3413/2024_8196548B", - # 2024-1004 - "PHUKET/P3423/2024_OR_5E3C7D19", # XXXX - # 2024-1025 - "PARANA/10835/2021_OR_8025A28A", - "TOLEDO/PARANA_10835/2021_OR_8025A28A", - "ENGLAND/3640434/2024_OR_A126DDAB", - "CATALONIA/NSAV198309324/2024_OR_29D1522E", - # 2024-1126 - "OREGON/FLU-OHSU-242090049/2024_05BD35BD", - "OREGON/FLU-OHSU-242090044/2024_DEC6E084", - # 2024-1209 - "NAKHON_PHANOM/F2539/2024_OR_276AD388", - # 2025-0901 - "CAMBODIA/KPH240133/2024_OR_CD9DB776", - "ENGLAND/4820116/2024_OR_F5C11575", - # 2025-2401 - "GALICIA/GA-CHUAC-172/2024_OR_000994B0", - "ROMANIA/2920/2024_F73C0E92", - # 2025-1202 - "BALTIMORE/JH-173/2025_A2FCE291", - "CROATIA/HZJZ-8235/2025_170E7DD7", - "ENGLAND/340238/2025_OR_EA36F218", - "ENGLAND/121421/2024_OR_2094499C", - "ANDALUCIA/PMC-00542/2025_OR_BD2FE74E", - "PAKISTAN/NIC-ICT-576/2025_OR_6924EA8E", - "ANDALUCIA/PMC-00549/2025_OR_A6464885", - "CASTELLON/VAHNSI-GIHSN-02_0386/2025_OR_2E26A6D3", - # 2025-0313 - "QUEENSLAND/IN000908/2025_91CEAA08", - "FRANCE/HDF-RELAB-IPP01090/2025_OR_096FF1D6", - "CATALONIA/NSVH171785721/2025_OR_C896185E", - "KAMCHATKA/205-20V/2025_OR_68CABAFF", - # 2025-0507 - "ENGLAND/5100240/2024_OR_EB872B9D", - "CAMBODIA/CCH250029/2025_OR_A4964B9E", - "ASTURIAS/232523061/2025_58877E98", - "PAKISTAN/NIC_SD193/2025_OR_3A27DCA6", - "ESPIRITO_SANTO/475/2025_OR_4FB190FF", - "PAKISTAN/NIC_SD184/2025_OR_66AF2F25", - "PAKISTAN/NIC_JHL115/2025_OR_53A17665", - "GALICIA/22647653/2025_DE4A5E72", - "IRELAND/4321/2025_8EB775D5", - "ENGLAND/640250/2025_OR_01187600", - "PAKISTAN/NIC_NIH749/2025_OR_40914714", - "BELGIUM/S4050/2025_OR_E331B39E", - "GALICIA/GA-CHUAC-314/2025_OR_59908BA2", - "MURCIA/528/2025_OR_E7444F60", - "OREL/SWINE-RII-WD2678S/2025_OR_6C9DA738", - "OREL/SWINE-RII-WD2681S/2025_OR_B004CABD", - "VERMONT/UVM-0478/2022_OR_DFD37A87", - "VERMONT/UVM-1652/2022_OR_42E6DF65", - "FRANCE/NOR-IPP04189/2025_OR_88070E22", # caused tree failure - # 2025-0716 (omitted exclusions in error, added 29/07, some added 0723 PR) - "CAMBODIA/SVH250268/2025_OR_238D9DD5", - "CAMBODIA/IAHC250126/2025_OR_FDAC2F6D", - "CAMBODIA/IKCM250040/2025_OR_7BD02395", - "CAMBODIA/CCH250037/2025_OR_347A5F98", - # 2025-0723 - "NIZHNIY_NOVGOROD/RII-MH239577S/2025_OR_B6AF35B5", - "CAMBODIA/KVH250142/2025_OR_3F8690DC", - "CAMBODIA/IAHC250133/2025_OR_C34D3AC3", - "SURGUT/RII-MH237071S/2025_OR_3A47CC28", - "MOROCCO/1168/2025_OR_910D767C", - # 2025-0729 - "ARKHANGELSK/RII-MH246940S/2025_OR_78BF9CC7", - ]) + 
.find_masters() # to be able to use .filter below + .include_name( + [ + outgroup, + "MICHIGAN/45/2015", + "BRAZIL/5026/2017", # # 183S, trying to show "the different 183P clades has S183P independently", inferred from 2018-0924-ssm tree + ] + + to_include + ) # ~/AD/share/conf/h1.183S.2017-2018.seqids + .exclude_name( + [ + "SWINE/", + # 2020-08-10 Sarah: They're not sequenced by a CC or NIC, but by a company. The sequences don't have obvious errors, but are very different (obviously) from the rest. My feeling is they're either variant viruses (from swine) or a sequencing error. + "ANKARA/14015-724/2019", + "ANKARA/14017-004/2019", + "ANKARA/14015-736/2019", + # 2021-06-22 deletion at the end + # "BADEN-WURTTEMBERG/1/2020", + # long branch 2021-0622 + "DENMARK/1/2021", + "MANITOBA/2/2021", + # 2021-1117 long branches + "MECKLENBURG-VORPOMMERN/1/2021", + "MECKLENBURG-VORPOMMERN/1-A/2021", # refers to MECKLENBURG-VORPOMMERN/1/2021 + "NETHERLANDS/10370-1/2020", + "NETHERLANDS/10370-1A/2020", + "NETHERLANDS/10370-2/2020", # refers to NETHERLANDS/10370-1A/2020 + "NETHERLANDS/GENT-193/2019", + "HESSEN/47/2020", + "HUBEI WUJIAGANG/1324/2020", + "HEBEI HAIGANG/SWL1572/2019", + "SHANDONG/204/2021", + "YUNNAN MENGZI/1462/2020", + "NORTH CAROLINA/15/2020", + "WISCONSIN/3/2021", + "PARANA/10835/2021", + # 2021-1125 long branches + "GANSU_XIFENG/1194/2021", + "GANSU_XIFENG/1143/2021", + "SICHUAN/1208/2021", + "TIANJIN/30/2020", + "WISCONSIN/5/2021", + "WISCONSIN/4/2021", + "NORTH_DAKOTA/12226/2021", + # 2021-1217 potential long branch + "IOWA/1/2021", # --> SWINE/MISSOURI/A01104146/2020 (base), SWINE/MISSOURI/A02525160/2021 + "IOWA/6/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + "NORTH DAKOTA/12226/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + # 2021-1217 long branches + "NETHERLANDS/10370-1A/2020", + # 2022-0131 potential long branch + "DENMARK/36/2021", + # 2022-04-26 mess in 156-179 + "INDIA/NIV-NICOBAR91/2020_OR_01A87F80", + "INDIA/NIV-NICOBAR163/2020_OR_823B2A0F", + "INDIA/NIV-NICOBAR106/2020_OR_B81F1DC2", + "INDIA/NIV-NICOBAR164/2020_OR_37A11133", + # 2022-10-07 potential long branch + "PARANA/20675/2022_OR_3D8D671C", + # 2022-12-10 potential long branch + "DAKAR/3124/2021_279E04A6", + # 2022-12-23 potential long branch + "NAVARRA/4050/2022_OR_BBC31B3A", + # 2023-01-13 potential long branch + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + # 2023-01-28 potential long branches + "NAVARRA/4050/2022_OR_BBC31B3A", + "DAKAR/3124/2021_279E04A6 pdm09-like", + "PAKISTAN/GIHSN-GB-445/2022_OR_A20D55B6", + "PAKISTAN/GIHSN-ICT-152/2023_OR_572EBEB8", + "PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + "CATALONIA/NSVH101996495/2023_OR_2E610047", + # 2023-02-04 potential long branch + "SWITZERLAND/69094/2022_OR_B3C15C9B", + # 2023-0421 potential long branches + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + "PAKISTAN/1465/2023_OR_1B7B0A1A", + "BADEN-WURTTEMBERG/159/2022_OR_6AB83E7E", + "PARANA/44706/2021_OR_25BFEB40", + # XXX 2023-0515 + "A/PAKISTAN/GIHSN/ICT/1838/2023_0867EDC7", + "A/PAKISTAN/GIHSN/ICT/1634/2023_OR_3D55578F", + "A/PAKISTAN/GIHSN/ICT/1798/2023_OR_C271B3C9", + "A/PAKISTAN/GIHSN/ICT/1682/2023_OR_FB7A0386", + # XXX 2023-0602 + "PAKISTAN/GIHSN-ICT2035/2023_OR_4E73FFF6", + # XXX 2023-0731 + "CYCLADES/215/2022_D164055B", + # long branch & XXXX 2023-0811 + "GREECE/SRIHER-001/2022_OR_E68E256A", + "SAUDI_ARABIA/40/2020_OR_9B1E74A4", + "SAUDI_ARABIA/39/2020_OR_DAE6A100", + # 2023-0822 XXXX + 
"PARANA/6405/2023_OR_9ACD4010", + "BAHIA/5785/2023_OR_CBFA332A", + # 2023-0903 very long branch + "NETHERLANDS/10534/2023_OR_6221D5BF", + # 2023-09-08 long branch in tree + "BADAJOZ/18570374/2023_58158069", + # 2023-09-15 XXXX + "ALGIERS/251/2023_OR_1A27768", + # 2023-10-23 XXX & odd start for SAO_PAULO + "SAO_PAULO/CEVI_2300134/2023_OR_BB923F69", + "CAMBODIA/KVH230151/2023_OR_1F665D9C", + "CAMBODIA/AHC230229/2023_OR_3241A3FC", + # 2023-20-28 + "BULGARIA/725/2023_8A6AF67F", # nonsense 1-29 + "CAMBODIA/CCH230213/2023_OR_BE937275", # XXXX + # 2023-12-04 + "BANGKOK/P5113/2023_OR_CB7624FD", # long branch + # 2024-01-01 + "SWITZERLAND/114/2023_MDCK1_A2D22DAA", # long branch + # 2024-01-30 + "CATALONIA/NSAV198289092/2023_OR_291AD7EDPARANA/13897/2023_OR_0A657176", + "BELGIUM/H00082/2023_56A80548", + "CACERES/59816382/2023_A1324011", + "SENEGAL/2098/2023_OR_2EC7BF97", + # 2024-0703 + "GALICIA/GA-CHUAC-98/2024_OR_988888E6", # 29 nuc differences, 19 nuc differences + "MEXICO/2271/2021_338E9C30", # run of differences, 35-38, 40-42 + "ALAGOAS/CEVI_2200198/2021_OR_0811FCC2", # 6 X + "MADRID/1832324191549/2024_OR_988BC424", # 9 X + "HUMAN/KENYA/5866/2023_3F1D8A0B", # 9X + # 2024-0718 + "SAO_PAULO/CEVI_24000816/2024_OR_BC9F903C", # XXXX + "SAO_PAULO/CEVI_24000889/2024_OR_C69399E2", # dels + # 2024-0727 + "LUXEMBOURG/LNS3963248/2024_04C1C479", # XXXX + "COSTA_RICA/INC-0552-822365/2024_OR_9E1DE43D", # XXXX + # 2024-08-06 + "CHANTHABURI/P2877/2024_OR_5B993EF6", # XXXXX + "SRI_LANKA/R4_BC75/2024_50C5FC20" # XXXXX + # 2024-0830 XXXX & many changes + "OHIO/251/2024_OR_30CB2DAF", + "HANOI/PF324163/2024_OR_402B095E", + "SOUTH_AFRICA/PET32788/2024_OR_47322CBB", + # 2024-0902 long branch + "YAOUNDE/24V-3413/2024_8196548B", + # 2024-1004 + "PHUKET/P3423/2024_OR_5E3C7D19", # XXXX + # 2024-1025 + "PARANA/10835/2021_OR_8025A28A", + "TOLEDO/PARANA_10835/2021_OR_8025A28A", + "ENGLAND/3640434/2024_OR_A126DDAB", + "CATALONIA/NSAV198309324/2024_OR_29D1522E", + # 2024-1126 + "OREGON/FLU-OHSU-242090049/2024_05BD35BD", + "OREGON/FLU-OHSU-242090044/2024_DEC6E084", + # 2024-1209 + "NAKHON_PHANOM/F2539/2024_OR_276AD388", + # 2025-0901 + "CAMBODIA/KPH240133/2024_OR_CD9DB776", + "ENGLAND/4820116/2024_OR_F5C11575", + # 2025-2401 + "GALICIA/GA-CHUAC-172/2024_OR_000994B0", + "ROMANIA/2920/2024_F73C0E92", + # 2025-1202 + "BALTIMORE/JH-173/2025_A2FCE291", + "CROATIA/HZJZ-8235/2025_170E7DD7", + "ENGLAND/340238/2025_OR_EA36F218", + "ENGLAND/121421/2024_OR_2094499C", + "ANDALUCIA/PMC-00542/2025_OR_BD2FE74E", + "PAKISTAN/NIC-ICT-576/2025_OR_6924EA8E", + "ANDALUCIA/PMC-00549/2025_OR_A6464885", + "CASTELLON/VAHNSI-GIHSN-02_0386/2025_OR_2E26A6D3", + # 2025-0313 + "QUEENSLAND/IN000908/2025_91CEAA08", + "FRANCE/HDF-RELAB-IPP01090/2025_OR_096FF1D6", + "CATALONIA/NSVH171785721/2025_OR_C896185E", + "KAMCHATKA/205-20V/2025_OR_68CABAFF", + # 2025-0507 + "ENGLAND/5100240/2024_OR_EB872B9D", + "CAMBODIA/CCH250029/2025_OR_A4964B9E", + "ASTURIAS/232523061/2025_58877E98", + "PAKISTAN/NIC_SD193/2025_OR_3A27DCA6", + "ESPIRITO_SANTO/475/2025_OR_4FB190FF", + "PAKISTAN/NIC_SD184/2025_OR_66AF2F25", + "PAKISTAN/NIC_JHL115/2025_OR_53A17665", + "GALICIA/22647653/2025_DE4A5E72", + "IRELAND/4321/2025_8EB775D5", + "ENGLAND/640250/2025_OR_01187600", + "PAKISTAN/NIC_NIH749/2025_OR_40914714", + "BELGIUM/S4050/2025_OR_E331B39E", + "GALICIA/GA-CHUAC-314/2025_OR_59908BA2", + "MURCIA/528/2025_OR_E7444F60", + "OREL/SWINE-RII-WD2678S/2025_OR_6C9DA738", + "OREL/SWINE-RII-WD2681S/2025_OR_B004CABD", + "VERMONT/UVM-0478/2022_OR_DFD37A87", + "VERMONT/UVM-1652/2022_OR_42E6DF65", + 
"FRANCE/NOR-IPP04189/2025_OR_88070E22", # caused tree failure + # 2025-0716 (omitted exclusions in error, added 29/07, some added 0723 PR) + "CAMBODIA/SVH250268/2025_OR_238D9DD5", + "CAMBODIA/IAHC250126/2025_OR_FDAC2F6D", + "CAMBODIA/IKCM250040/2025_OR_7BD02395", + "CAMBODIA/CCH250037/2025_OR_347A5F98", + # 2025-0723 + "NIZHNIY_NOVGOROD/RII-MH239577S/2025_OR_B6AF35B5", + "CAMBODIA/KVH250142/2025_OR_3F8690DC", + "CAMBODIA/IAHC250133/2025_OR_C34D3AC3", + "SURGUT/RII-MH237071S/2025_OR_3A47CC28", + "MOROCCO/1168/2025_OR_910D767C", + # 2025-0729 + "ARKHANGELSK/RII-MH246940S/2025_OR_78BF9CC7", + # 2025-0818 + "CATALONIA/NSVH161521968/2025_OR_A0FA1338", + "CAMBODIA/NPH250451/2025_OR_3F664AAE", + "CAMBODIA/IAHC250237/2025_OR_E4433FD6", + "VALENCIA/VAHNSI-GIHSN-09_0742/2025_OR_30A76211", + "SAO_PAULO/CEVI_2500217/2025_OR_A0D5251B", + "SAO_PAULO/CEVI_2500207/2025_OR_3C666E5E", + "HONG_KONG/F40/2024_OR_94A307F6", + # 2025-0827 + "KRASNOGORSK/RII-MH248987S/2025_OR_CA8C4660", + # 2025-0910 no exclusions + # 2025-0917 + "CATALONIA/NSVH161521968/2025_OR_A0FA1338", + "CAMBODIA/IAHC250237/2025_OR_E4433FD6", + "CAMBODIA/NPH250451/2025_OR_3F664AAE", + "VALENCIA/VAHNSI-GIHSN-09_0742/2025_OR_30A76211", + "SAO_PAULO/CEVI_2500207/2025_OR_3C666E5E", + "KRASNOGORSK/RII-MH248987S/2025_OR_CA8C4660", + "HONG_KONG/F40/2024_OR_94A307F6", + # 2025-1006 no exclusions + # 2025-1028 + "SENEGAL/IPD-2503/2025_OR_20CCF2D9", + "POLAND/D7325_10/2023_763600E9", + "NETHERLANDS/D6904_36/2022_66C9DF52", + "POLAND/D7325_7/2023_CE4D3423", + "NETHERLANDS/D7061_16/2022_8D08423F", + "SPAIN/D7228_1/2022_F78E9BF7", + "AUSTRIA/D7225_20/2023_12AC9B5C", + "SPAIN/D7228_40/2023_36844405", + "BELGIUM/D6840_21/2022_32C5D497", + "SPAIN/D7228_50/2023_99D2EC08", + "ITALY/D7434_7/2023_BC9D4B6B", + "PORTUGAL/D7272_2/2023_0ED2CAE8", + "GERMANY/D7225_5/2023_13A6321B", + "GERMANY/D7225_17/2023_E6C859C1", + "POLAND/D7325_11/2023_906FE020", + "NETHERLANDS/D6840_51/2022_E7F7E7FB", + "GERMANY/D7061_22/2023_33D65F89", + "POLAND/D6904_24/2022_AF273C3D", + "POLAND/D7325_23/2023_36B8A0E4", + "NETHERLANDS/D7061_27/2022_DCE07BA1", + "POLAND/D6904_5/2022_240EA095", + "POLAND/D6840_59/2022_95E58799", + "BELGIUM/D7157_61/2023_6A0D50EA", + "GERMANY/D7061_33/2022_6DD93351", + "SLOVENIA/D7378_1_1/2023_473636B1", + "GERMANY/D7061_8/2022_5C22171F", + "GERMANY/D7225_1/2023_6BCBB1AC", + "GERMANY/D7061_11/2023_EA3CB29E", + "GERMANY/D7061_9/2023_D612A4DA", + "BULGARIA/D6904_28/2022_74CF41D4", + "BULGARIA/D6904_25/2022_DA88ADA9", + "SPAIN/D7228_24/2023_F14A6077", + # 2025-1112 + "RIYADH/63/2021_OR_677764E9", + "RIYADH/65/2022_OR_B365D7CC", + "RIYADH/19/2021_OR_9E0E1562", + "RIYADH/88/2022_OR_8E98C98E", + "RIYADH/100/2022_OR_8B809E4C", + "RIYADH/111/2023_OR_886090A3", + "RIYADH/120/2023_OR_EA0E7626", + "RIYADH/49/2022_OR_DB27594B", + "RIYADH/88/2023_OR_E11E47BF", + "RIYADH/66/2022_OR_03C7C7E0", + "TOULON/INF_4/2023_OR_A4196D00", + # 2025-1126 + "CAMBODIA/KPH250113/2025_OR_EABF2384", + "CAMBODIA/IAHC250287/2025_OR_3BF3D0EB", + "CAMBODIA/IMDK250044/2025_OR_037B06A4", + "ASTURIAS/232557424/2025_A6C56596", + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") .move_name_to_beginning(outgroup) ) - report(subtype, selected, expected_nuc_size=1647, limit=0) # aa 549 + report(subtype, selected, expected_nuc_size=1647, limit=0) # aa 549 export(subtype, selected) + # ---------------------------------------------------------------------- + def h3(subtype: str, args: argparse.Namespace): outgroup = "PERTH/16/2009" selected = ( @@ -375,272 +523,321 @@ def h3(subtype: str, 
args: argparse.Namespace): .human() .exclude_with_issue() .filter_dates(first="2018-03") - .find_masters() # to be able to use .filter below - .include_name([ - outgroup, - "VICTORIA/361/2011", # 2011-10-24 3C.3 - # [2020-09-07] Nearest common ancestor of 2a and 3a in /syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ - "MARYLAND/30/2012", # 2012-07-26 3C.3 - # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough - "TEXAS/50/2012", # 2012-04-15 3C.3 root in MELB tree 2020 - "VERMONT/6/2012", - "STOCKHOLM/6/2014", # 2014-02-06 3a - # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top - "SWITZERLAND/9715293/2013", # 2013-12-06 3a - "NORWAY/466/2014", # 2014-02-03 3a - "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a - "TASMANIA/11/2014", # 2014-03-16 3a - "KOBE/63/2014", # 2014-05-21 3a - "PERU/27/2015", # 2015-04-13 3a - "NEVADA/22/2016", # 2016-03-05 3a - "IDAHO/33/2016", # 2016-06-08 3a - "TEXAS/88/2016", # 2016-02-25 3a - "TEXAS/71/2017", # 2017-03-18 3a - "BRAZIL/7331/2018", # 2018-07-09 3a - "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page - "HONG_KONG/4801/2014", # 2014-02-26 2a - "HAWAII/47/2014", # 2014-07-18 2a - "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 - "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 - "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 - "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K - "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K - "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K - "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K - - # Strains before and after T135N substitution to have a better 135N branch placement - # Sarah 2021-02-08 17:05 - "WISCONSIN/85/2016", - "SRI_LANKA/56/2017", - "SOUTH_CAROLINA/4/2017", - "YOKOHAMA/145/2017", - "INDIA/9930/2017", - "HONG_KONG/3118/2017", - "HAWAII/47/2014", - "NIIGATA-C/43/2015", - "DAKAR/17/2016", - # "CAMEROON/16V-9267/2016", # excluded, truncated sequence - "LAOS/3008/2016", - "YUNNAN_LINXIANG/1718/2016", - "HONG_KONG/2302/2016", - "ONTARIO/RV2414/2015", - "ONTARIO/RV2414/2015", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - ]) - .exclude_name([ - "SWINE/", - # long branch (2021-0622) - "WISCONSIN/1/2021", - # long branch (2021-1117) - "HAWAII/28/2020", - "SOUTH_AUSTRALIA/85/2018", - "SOUTH_AUSTRALIA/1/2021", - "MANITOBA/3/2021", - "GUANGXI_GANGBEI/190/2019", - # 2021-1217 potential long branch - "OHIO/6/2021_ISOLATE", - # 2022-0204 potential long branch - "FRANCE/ARA-HCL021219268801/2021", - "FRANCE/ARA-HCL021220304101/2021", - # 2022-0209 long branch, eight X, R in nuc - "ONTARIO/RV2002/2019_P1_0F1E8F1E", - # 2022-0415 potential long branch - "ACRE/181823-IEC/2021", - # 2022-0819 potential long branch - "CATALONIA/NSVH101883267/2022_OR_F79C7813", - # 2022-0825 long branch in the tree - "INDIANA/27/2018_SIAT1_26C3A2A4", - "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", - "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", - # 2022-0826 potential long branch - "FIJI/99/2022_OR_80B95BF8", - "FIJI/102/2022_OR_6AEB79D8", - "WEST_VIRGINIA/25/2022_SIAT1_3E592413", - "WEST_VIRGINIA/26/2022_SIAT1_3E592413", - "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", - # 2022-0902 potential long branch - "FRANCE/ARA-HCL022060609601/2022_65F4520B", - # 2022-10-07 potential long branch - "CATALONIA/NSVH101908168/2022_OR_D5C0176E", - # 2022-10-28 potential long branch - 
"MICHIGAN/48/2022_OR_910196F9", - # 2022-11-04 potential long branch - "NEW_MEXICO/19/2022_IRX_4BA11056", - # 2022-11-11 potential long branches - "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", - "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", - "ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", - # 2022-11-26 potential long branches - "CATALONIA/NSVH101954411/2022_OR_7A633967", - "CATALONIA/NSVH510598655/2022_OR_57BCC58B", - # 2022-12-02 potential long branches - "MISSOURI/19/2022_OR_837C934E", - # 2022-12-10 potential long branches - "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 159X 160X 162X 163X 164X - "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X - "MICHIGAN/48/2022_OR_910196F9", - "MICHIGAN/UOM10047284024/2022_OR_910196F9", - # 2023-01-08 potential long branches - "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", - # 2023-01-13 potential long branch - "GALICIA/22034482/2022_2EE89148", - "GALICIA/22018524/2022_1EE46082", - "GALICIA/22033804/2022_0EE699D6", - "GALICIA/21987208/2022_F49E5748", - "GALICIA/22036079/2022_430E996A", - "GALICIA/22018743/2022_0116B017", - "GALICIA/22011636/2022_3D9F40AD", - # 2023-01-20 potential long branch - "SINGAPORE/GP9117/2022_OR_69761A51", - "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", - # 2023-01-28 potential long branches - "MICHIGAN/48/2022_OR_910196F9", - "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", - "KHON_KAEN/FS005/2022_OR_1A5637D8", - "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", - "ATHENS/355/2022_D9D4A86", - "CATALONIA/NSVH101994299/2023_OR_13A43DA9", - "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", - "SOUTH_AFRICA/R07603/2022_OR_7937FE46", - "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", - "GALICIA/22034482/2022_2EE89148", - "CATALONIA/NSVH560028269/2022_OR_9C189CFA", - # 2023-02-04 potential long branches - "MANITOBA/RV00728/2022_P1_5236154F", - "ONTARIO/198/2022_OR_D1EC3B40", - # 2023-04-21 potential long branches - "ROMANIA/493/2023_612606B9", - "PAKISTAN/1140/2023_OR_3D2D111B", - "CATALONIA/NSVH102045605/2023_OR_7490FFA8", - # XXX & long branch 2023-0731 - "BAHIA/PVM105707/2021_OR_BCE07E46", - "SOUTH_AFRICA/K056301/2023_30E0E639", - # XXXXX 2023-0811 CORRECTED 2023-1023 - "BAHIA/292038965/2021_70230607", - "BAHIA/292104308/2022_D4526D62", - # XXXXX 2023-0829 CORRECTED 2023-1023 - "SOUTH_AFRICA/PET28428/2023_7572E223", - "SOUTH_AFRICA/PET28193/2023_BF46BBB4", - "NORWAY/8084/2023_OR_4548A267", - # 2023-09-08 potential long branch ?why - "SOUTH_AFRICA/K056301/2023_CC1FE952" - # 2023-10-23 XXX - "PARA/CEVI_2200263/2021_OR_6D24213E", - "SAO_PAULO/CEVI_2200855/2022_OR_D4E825BF", - "SAO_PAULO/CEVI_2200871/2022_OR_280FCE18", - # 2023-10-28 - "SOUTH_AFRICA/K056301/2023_CC1FE952", # very divergent - # 2023-12-04 XXXX - "OMAN/CPHL_52323056/2023_OR_297A7E60", - "LUSAKA/1-NIC-332/2023_OR_BA018CE9", - "NORWAY/11072/2023_OR_EB1EA580", - # 2024-01-01 - "SAINT_PETERSBURG/RII-MH151473S/2023_OR_D20D279D", # XX - "SAINT_PETERSBURG/RII-4937S/2023_OR_95887A8C", # XX - "CANADA/NOVASCOTIA-32036/2022_A90EF453", # first 7 aa - "BRITISH_COLUMBIA/PHL-602/2022_OR_A1B8E845", # XX - # 2024-01-22 for 2024-01-16 file - "INDONESIA/BIOKES-IBKL1127/2023_OR_791937E7", # long branch - "SAO_PAULO/CEVI_2201055/2022_OR_BF9D3752", # XX - "ROMANIA/3227/2023_E20D207E", #XX - "SAO_PAULO/CEVI_2200903/2022_OR_BB30A339", # XX - "CACERES/50644480/2023_361FA786", # long branch - "BELGIUM/S1112/2023_OR_D7ABA060", # long branch - "BELGIUM/S0576/2023_OR_CEC5B896", # long branch - "BELGIUM/S0220/2023_OR_0B549607", # long branch - "BELGIUM/H0592/2022_OR_22BD830C", # 
long branch - # 2024-01-30 - "SPAIN/GA-HULA-615019/2023_B7077195", - "BELGIUM/S03941/2023_F4250630", - "NEW_YORK/392/2004_CFB2F75C", - "SPAIN/GA-HULA-615037/2023_E8183B9E", - "CACERES/59816379/2023_4D9F91F8", - "TURKEY/GIHSN_ANKARA-140111_950/2023_E0D8B88D", - # 2024-02-11 XXXX Indonesia is long branch - "KENYA/AFI-KDH-1820/2023_B3509344", - "INDONESIA/BIOKES-IDSW0063/2023_OR_5962326E", - "KENYA/AFI-ISB-1107/2023_E5F19B22", - "KENYA/AFI-MNB-1100/2023_6D42271", - # 2024-0703 - "CHIANG_RAI/P2287/2024_OR_2744AA9C", # 3 deletions - # 2024-0727 - "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # 198 diffs - "WASHINGTON/S25532/2024_OR_13A107A6", # XXXX - "OREGON/S25529/2024_OR_5502D106", # XXXX - "OREGON/S25537/2023_OR_BE6502FF", # XXXX - "CAMBODIA/CCH230307/2023_7CC43803", # XXXX - "CAMBODIA/IAHC230253/2023_50A155E0", # XXXX - # 2024-08-06 - "VIRGINIA/H20-355-5372/2024_OR_53F1E92E", # XXXXX - "PANAMA/M273491/2024_OR_27E79A9B" # XXXXX - # 2024-0830 many changes - "COLORADO/150/2024_OR_2BD9D59A", - "COLORADO/150/2024_SIAT1_14B3FB98", - "MICHIGAN/113/2024_SIAT1_B0E6BEF3", - # 2024-0902 - "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # long branch - "PANAMA/M273491/2024_OR_27E79A9B", # XXXX - # 2024-1004 - "CAMBODIA/NPH240499/2024_OR_F38419C8", #XXXX - # 2024-1025 - "CAMBODIA/NPH240575/2024_OR_33D3499A", - # 2024-1115 - "PARANA/44706/2021_OR_25BFEB40", - "SANTA_HELENA/PARANA_44706/2021_OR_25BFEB40", - # 2024-0111 (tree built 2024-1811) - "GREATER_ACCRA_REGION/FS-24-4832/2024_B7004939", - "GREATER_ACCRA_REGION/FS-24-4779/2024_1393BD1C", - # 2024-2611 - "QUEENSLAND/IN000643/2024_1ACA7E5A", - "QUEENSLAND/IN000245/2024_0854A342", - "WASHINGTON/S26082/2022_OR_7462BFDE", - "WASHINGTON/S26079/2022_OR_07B0CEEC", - "WASHINGTON/S26107/2023_OR_A56901B3", - "WASHINGTON/S26104/2022_OR_E80CAD50", - "WASHINGTON/S26097/2022_OR_40AE3B8C", - "WASHINGTON/S26077/2022_OR_C294BE90", - "WASHINGTON/S26098/2022_OR_1EDB1957", - "WASHINGTON/S26088/2022_OR_C6B3FAA9", - # 2024-0912 - "GALICIA/28220935/2023_C495745E", - "GALICIA/33341960/2023_0FF7D1C8", - # 2025-2401 - "TURKMENISTAN/784/2025_OR_F898A707", - # 2025-1202 - "COTE_DIVOIRE/4331/2024_OR_E0FF092D", - # 2025-1202 - "WISCONSIN/67/2005_XXMDCK2_8C52B46C", - # 2025-0716 (omitted exclusions in error, added 29/07 PR) - "INDIA/BHO_NIV25-827/2025_MDCK3_C54368E2", - "CAMBODIA/IAHC240402/2024_OR_35F1131A", - "INDIA/PUN_NIVCOV2572/2025_MDCK1_C9AF818D", - "ASTURIAS/232524600/2025_A4329A8F", - "OREL/SWINE-RII-WD2652S/2025_OR_57A8B37C", - "OREL/SWINE-RII-WD2687S/2025_OR_C384E32B", - "MEXICO/2496813201/2025_OR_E2853515", - # 2025-0723 - "NORWAY/6870/2025_OR_828769BE", - "BRITISH_COLUMBIA/RV00479/2025_SIAT2/SIAT1_0DA227AA", - "QUEBEC/240/2025_OR_8D4D59A8", - "GALICIA/52520233/2025_F71B58B8", - "BULGARIA/2506/2025_SIAT2/SIAT1_8B0BEF2D", - "BULGARIA/1941/2025_SIAT2/SIAT1_0D5E6F90", - "CAMBODIA/IBTB250007/2025_OR_4E893E81", - "ASTURIAS/232525925/2025_719B9A6C", - "BAVARIA/6511057/2025_OR_D759C9B5", - "MEXICO/2496813201/2025_OR_E2853515", - "CAMBODIA/SVH250074/2025_OR_2253D125", - "ALBANIA/310521/2025_SIAT1_5DCA4F3D", - "CAMBODIA/IAHC250174/2025_OR_6BFB38A8", - "FRANCE/ARA-HCL023004739801/2023_OR_003FC67E", - "SOUTH_AFRICA/NHLS-UCT-CERI-C067966/2025_EE268891", - "PERU/LIM-UPCH-4915/2025_1E953D52", - # 2025-0729 no exclusions - ]) + .find_masters() # to be able to use .filter below + .include_name( + [ + outgroup, + "VICTORIA/361/2011", # 2011-10-24 3C.3 + # [2020-09-07] Nearest common ancestor of 2a and 3a in 
/syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ + "MARYLAND/30/2012", # 2012-07-26 3C.3 + # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough + "TEXAS/50/2012", # 2012-04-15 3C.3 root in MELB tree 2020 + "VERMONT/6/2012", + "STOCKHOLM/6/2014", # 2014-02-06 3a + # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top + "SWITZERLAND/9715293/2013", # 2013-12-06 3a + "NORWAY/466/2014", # 2014-02-03 3a + "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a + "TASMANIA/11/2014", # 2014-03-16 3a + "KOBE/63/2014", # 2014-05-21 3a + "PERU/27/2015", # 2015-04-13 3a + "NEVADA/22/2016", # 2016-03-05 3a + "IDAHO/33/2016", # 2016-06-08 3a + "TEXAS/88/2016", # 2016-02-25 3a + "TEXAS/71/2017", # 2017-03-18 3a + "BRAZIL/7331/2018", # 2018-07-09 3a + "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page + "HONG_KONG/4801/2014", # 2014-02-26 2a + "HAWAII/47/2014", # 2014-07-18 2a + "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 + "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 + "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 + "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K + "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K + "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K + "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K + # Strains before and after T135N substitution to have a better 135N branch placement + # Sarah 2021-02-08 17:05 + "WISCONSIN/85/2016", + "SRI_LANKA/56/2017", + "SOUTH_CAROLINA/4/2017", + "YOKOHAMA/145/2017", + "INDIA/9930/2017", + "HONG_KONG/3118/2017", + "HAWAII/47/2014", + "NIIGATA-C/43/2015", + "DAKAR/17/2016", + # "CAMEROON/16V-9267/2016", # excluded, truncated sequence + "LAOS/3008/2016", + "YUNNAN_LINXIANG/1718/2016", + "HONG_KONG/2302/2016", + "ONTARIO/RV2414/2015", + "ONTARIO/RV2414/2015", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + ] + ) + .exclude_name( + [ + "SWINE/", + # long branch (2021-0622) + "WISCONSIN/1/2021", + # long branch (2021-1117) + "HAWAII/28/2020", + "SOUTH_AUSTRALIA/85/2018", + "SOUTH_AUSTRALIA/1/2021", + "MANITOBA/3/2021", + "GUANGXI_GANGBEI/190/2019", + # 2021-1217 potential long branch + "OHIO/6/2021_ISOLATE", + # 2022-0204 potential long branch + "FRANCE/ARA-HCL021219268801/2021", + "FRANCE/ARA-HCL021220304101/2021", + # 2022-0209 long branch, eight X, R in nuc + "ONTARIO/RV2002/2019_P1_0F1E8F1E", + # 2022-0415 potential long branch + "ACRE/181823-IEC/2021", + # 2022-0819 potential long branch + "CATALONIA/NSVH101883267/2022_OR_F79C7813", + # 2022-0825 long branch in the tree + "INDIANA/27/2018_SIAT1_26C3A2A4", + "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", + "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", + # 2022-0826 potential long branch + "FIJI/99/2022_OR_80B95BF8", + "FIJI/102/2022_OR_6AEB79D8", + "WEST_VIRGINIA/25/2022_SIAT1_3E592413", + "WEST_VIRGINIA/26/2022_SIAT1_3E592413", + "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", + # 2022-0902 potential long branch + "FRANCE/ARA-HCL022060609601/2022_65F4520B", + # 2022-10-07 potential long branch + "CATALONIA/NSVH101908168/2022_OR_D5C0176E", + # 2022-10-28 potential long branch + "MICHIGAN/48/2022_OR_910196F9", + # 2022-11-04 potential long branch + "NEW_MEXICO/19/2022_IRX_4BA11056", + # 2022-11-11 potential long branches + "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", + "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", + 
"ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", + # 2022-11-26 potential long branches + "CATALONIA/NSVH101954411/2022_OR_7A633967", + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + # 2022-12-02 potential long branches + "MISSOURI/19/2022_OR_837C934E", + # 2022-12-10 potential long branches + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 159X 160X 162X 163X 164X + "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X + "MICHIGAN/48/2022_OR_910196F9", + "MICHIGAN/UOM10047284024/2022_OR_910196F9", + # 2023-01-08 potential long branches + "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", + # 2023-01-13 potential long branch + "GALICIA/22034482/2022_2EE89148", + "GALICIA/22018524/2022_1EE46082", + "GALICIA/22033804/2022_0EE699D6", + "GALICIA/21987208/2022_F49E5748", + "GALICIA/22036079/2022_430E996A", + "GALICIA/22018743/2022_0116B017", + "GALICIA/22011636/2022_3D9F40AD", + # 2023-01-20 potential long branch + "SINGAPORE/GP9117/2022_OR_69761A51", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + # 2023-01-28 potential long branches + "MICHIGAN/48/2022_OR_910196F9", + "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", + "KHON_KAEN/FS005/2022_OR_1A5637D8", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + "ATHENS/355/2022_D9D4A86", + "CATALONIA/NSVH101994299/2023_OR_13A43DA9", + "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", + "SOUTH_AFRICA/R07603/2022_OR_7937FE46", + "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", + "GALICIA/22034482/2022_2EE89148", + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", + # 2023-02-04 potential long branches + "MANITOBA/RV00728/2022_P1_5236154F", + "ONTARIO/198/2022_OR_D1EC3B40", + # 2023-04-21 potential long branches + "ROMANIA/493/2023_612606B9", + "PAKISTAN/1140/2023_OR_3D2D111B", + "CATALONIA/NSVH102045605/2023_OR_7490FFA8", + # XXX & long branch 2023-0731 + "BAHIA/PVM105707/2021_OR_BCE07E46", + "SOUTH_AFRICA/K056301/2023_30E0E639", + # XXXXX 2023-0811 CORRECTED 2023-1023 + "BAHIA/292038965/2021_70230607", + "BAHIA/292104308/2022_D4526D62", + # XXXXX 2023-0829 CORRECTED 2023-1023 + "SOUTH_AFRICA/PET28428/2023_7572E223", + "SOUTH_AFRICA/PET28193/2023_BF46BBB4", + "NORWAY/8084/2023_OR_4548A267", + # 2023-09-08 potential long branch ?why + "SOUTH_AFRICA/K056301/2023_CC1FE952" + # 2023-10-23 XXX + "PARA/CEVI_2200263/2021_OR_6D24213E", + "SAO_PAULO/CEVI_2200855/2022_OR_D4E825BF", + "SAO_PAULO/CEVI_2200871/2022_OR_280FCE18", + # 2023-10-28 + "SOUTH_AFRICA/K056301/2023_CC1FE952", # very divergent + # 2023-12-04 XXXX + "OMAN/CPHL_52323056/2023_OR_297A7E60", + "LUSAKA/1-NIC-332/2023_OR_BA018CE9", + "NORWAY/11072/2023_OR_EB1EA580", + # 2024-01-01 + "SAINT_PETERSBURG/RII-MH151473S/2023_OR_D20D279D", # XX + "SAINT_PETERSBURG/RII-4937S/2023_OR_95887A8C", # XX + "CANADA/NOVASCOTIA-32036/2022_A90EF453", # first 7 aa + "BRITISH_COLUMBIA/PHL-602/2022_OR_A1B8E845", # XX + # 2024-01-22 for 2024-01-16 file + "INDONESIA/BIOKES-IBKL1127/2023_OR_791937E7", # long branch + "SAO_PAULO/CEVI_2201055/2022_OR_BF9D3752", # XX + "ROMANIA/3227/2023_E20D207E", # XX + "SAO_PAULO/CEVI_2200903/2022_OR_BB30A339", # XX + "CACERES/50644480/2023_361FA786", # long branch + "BELGIUM/S1112/2023_OR_D7ABA060", # long branch + "BELGIUM/S0576/2023_OR_CEC5B896", # long branch + "BELGIUM/S0220/2023_OR_0B549607", # long branch + "BELGIUM/H0592/2022_OR_22BD830C", # long branch + # 2024-01-30 + "SPAIN/GA-HULA-615019/2023_B7077195", + "BELGIUM/S03941/2023_F4250630", + "NEW_YORK/392/2004_CFB2F75C", + "SPAIN/GA-HULA-615037/2023_E8183B9E", + "CACERES/59816379/2023_4D9F91F8", + "TURKEY/GIHSN_ANKARA-140111_950/2023_E0D8B88D", + # 
2024-02-11 XXXX Indonesia is long branch + "KENYA/AFI-KDH-1820/2023_B3509344", + "INDONESIA/BIOKES-IDSW0063/2023_OR_5962326E", + "KENYA/AFI-ISB-1107/2023_E5F19B22", + "KENYA/AFI-MNB-1100/2023_6D42271", + # 2024-0703 + "CHIANG_RAI/P2287/2024_OR_2744AA9C", # 3 deletions + # 2024-0727 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # 198 diffs + "WASHINGTON/S25532/2024_OR_13A107A6", # XXXX + "OREGON/S25529/2024_OR_5502D106", # XXXX + "OREGON/S25537/2023_OR_BE6502FF", # XXXX + "CAMBODIA/CCH230307/2023_7CC43803", # XXXX + "CAMBODIA/IAHC230253/2023_50A155E0", # XXXX + # 2024-08-06 + "VIRGINIA/H20-355-5372/2024_OR_53F1E92E", # XXXXX + "PANAMA/M273491/2024_OR_27E79A9B" # XXXXX + # 2024-0830 many changes + "COLORADO/150/2024_OR_2BD9D59A", + "COLORADO/150/2024_SIAT1_14B3FB98", + "MICHIGAN/113/2024_SIAT1_B0E6BEF3", + # 2024-0902 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # long branch + "PANAMA/M273491/2024_OR_27E79A9B", # XXXX + # 2024-1004 + "CAMBODIA/NPH240499/2024_OR_F38419C8", # XXXX + # 2024-1025 + "CAMBODIA/NPH240575/2024_OR_33D3499A", + # 2024-1115 + "PARANA/44706/2021_OR_25BFEB40", + "SANTA_HELENA/PARANA_44706/2021_OR_25BFEB40", + # 2024-0111 (tree built 2024-1811) + "GREATER_ACCRA_REGION/FS-24-4832/2024_B7004939", + "GREATER_ACCRA_REGION/FS-24-4779/2024_1393BD1C", + # 2024-2611 + "QUEENSLAND/IN000643/2024_1ACA7E5A", + "QUEENSLAND/IN000245/2024_0854A342", + "WASHINGTON/S26082/2022_OR_7462BFDE", + "WASHINGTON/S26079/2022_OR_07B0CEEC", + "WASHINGTON/S26107/2023_OR_A56901B3", + "WASHINGTON/S26104/2022_OR_E80CAD50", + "WASHINGTON/S26097/2022_OR_40AE3B8C", + "WASHINGTON/S26077/2022_OR_C294BE90", + "WASHINGTON/S26098/2022_OR_1EDB1957", + "WASHINGTON/S26088/2022_OR_C6B3FAA9", + # 2024-0912 + "GALICIA/28220935/2023_C495745E", + "GALICIA/33341960/2023_0FF7D1C8", + # 2025-2401 + "TURKMENISTAN/784/2025_OR_F898A707", + # 2025-1202 + "COTE_DIVOIRE/4331/2024_OR_E0FF092D", + # 2025-1202 + "WISCONSIN/67/2005_XXMDCK2_8C52B46C", + # 2025-0716 (omitted exclusions in error, added 29/07 PR) + "INDIA/BHO_NIV25-827/2025_MDCK3_C54368E2", + "CAMBODIA/IAHC240402/2024_OR_35F1131A", + "INDIA/PUN_NIVCOV2572/2025_MDCK1_C9AF818D", + "ASTURIAS/232524600/2025_A4329A8F", + "OREL/SWINE-RII-WD2652S/2025_OR_57A8B37C", + "OREL/SWINE-RII-WD2687S/2025_OR_C384E32B", + "MEXICO/2496813201/2025_OR_E2853515", + # 2025-0723 + "NORWAY/6870/2025_OR_828769BE", + "BRITISH_COLUMBIA/RV00479/2025_SIAT2/SIAT1_0DA227AA", + "QUEBEC/240/2025_OR_8D4D59A8", + "GALICIA/52520233/2025_F71B58B8", + "BULGARIA/2506/2025_SIAT2/SIAT1_8B0BEF2D", + "BULGARIA/1941/2025_SIAT2/SIAT1_0D5E6F90", + "CAMBODIA/IBTB250007/2025_OR_4E893E81", + "ASTURIAS/232525925/2025_719B9A6C", + "BAVARIA/6511057/2025_OR_D759C9B5", + "MEXICO/2496813201/2025_OR_E2853515", + "CAMBODIA/SVH250074/2025_OR_2253D125", + "ALBANIA/310521/2025_SIAT1_5DCA4F3D", + "CAMBODIA/IAHC250174/2025_OR_6BFB38A8", + "FRANCE/ARA-HCL023004739801/2023_OR_003FC67E", + "SOUTH_AFRICA/NHLS-UCT-CERI-C067966/2025_EE268891", + "PERU/LIM-UPCH-4915/2025_1E953D52", + # 2025-0729 no exclusions + # 2025-0818 + "INDIA/BHO-NIV25-1226/2025_MDCK1_2E99D391", + "INDIA/PUD-NIV25-1183/2025_MDCK4_1BD68829", + "INDIA/JAI-NIV25-1120/2025_MDCK1_9051692B", + "ALBANIA/310569/2025_SIAT1_A284C73A", + "SWINE/OHIO/A02858604/2025_OR_3F9C4EB8", + "SWINE/NORTH_CAROLINA/A02979483/2025_OR_9F18313D", + "SWINE/IOWA/A02858606/2025_OR_8367F95A", + "SWINE/IOWA/A02858602/2025_OR_6B07CDAC", + "SWINE/MINNESOTA/A02858601/2025_OR_8A30B201", + "SWINE/MINNESOTA/A02979412/2025_OR_1838EF26", + # 2025-0827 no exclusions + # 2025-0910 no exclusions + # 2025-0917 + 
"INDIA/BHO-NIV25-1226/2025_MDCK1_2E99D391", + "INDIA/PUD-NIV25-1183/2025_MDCK4_1BD68829", + "ALBANIA/310569/2025_SIAT1_A284C73A", + "INDIA/AS-RMRC-DIB-1189/2023_D09F02E4", + # 2025-1006 + "MEXICO/2578825701/2025_OR_B6857194", + "CAMPO_GRANDE/LEIAL6538/2025_OR_3083EF3A", + # 2025-1028 + "INDIA/DIB-NIV25-1297/2025_DEAC04B8", + "INDIA/PUN-NIV25-1199/2025_EA34056E", + "INDIA/DEL-NIV25-1587/2025_01FD20A7", + "INDIA/PUN-NIVCOV251684/2025_95A3CE8E", + "INDIA/PUD-NIV25-1176/2025_8B8D9E17", + "INDIA/LUK-NIV25-1336/2025_4C35C9DE", + "INDIA/DIB-NIV25-1309/2025_0D480D68", + "INDIA/LUK-NIV25-1344/2025_ED98BB3E", + "INDIA/PUD-NIV25-1341/2025_663D69FB", + "GALICIA/GA-CHUAC-478/2025_OR_AE004DEB", + # 2025-1112 + "ESPIRITO_SANTO/4988/2025_OR_C4DA8B91", + # 2025-1126 + "CAMBODIA/NPH250828/2025_OR_B75D636B", + "CAMBODIA/KVH250244/2025_OR_DB084CEF", + "CAMBODIA/ISVR250088/2025_OR_8696921D", + "CAMBODIA/KPH250212/2025_OR_0DF92A52", + "CAMBODIA/INPH250362/2025_OR_6FD76539", + "CAMBODIA/AHC250325/2025_OR_10F98EBA", + "CAMBODIA/NPH250817/2025_OR_DD4119F8", + "CAMBODIA/AHC250341/2025_OR_D6BA3933", + "CAMBODIA/IKPT250105/2025_OR_6BA23E98", + "KILIFI/MAT220593/2021_OR_E96D0DC0", + "MINAS_GERAIS/CEVI_2501233/2025_OR_ACDD91A4", + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") @@ -649,41 +846,61 @@ def h3(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1650, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- sSubtypeToFunc = {"BVIC": bvic, "BYAM": byam, "A(H1N1)": h1, "A(H3N2)": h3} # ---------------------------------------------------------------------- -def export(subtype: str, selected :ae_backend.seqdb.Selected): - fasta.write(filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), selected=selected, aa=False) + +def export(subtype: str, selected: ae_backend.seqdb.Selected): + fasta.write( + filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), + selected=selected, + aa=False, + ) + # ---------------------------------------------------------------------- + def subtype_dir(subtype: str): dir = Path(sSubdirforSubtype[subtype]) dir.mkdir(exist_ok=True) return dir + # ---------------------------------------------------------------------- -def report(subtype: str, selected :ae_backend.seqdb.Selected, expected_nuc_size: int, limit: int = 20): + +def report( + subtype: str, + selected: ae_backend.seqdb.Selected, + expected_nuc_size: int, + limit: int = 20, +): print(f"{subtype:7s} {len(selected)}") for seq in selected: if len(seq.nuc) != expected_nuc_size: - print(f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) ") + print( + f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) " + ) for no, seq in enumerate(selected): if no >= limit: break # print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}] {seq.aa}") print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}]") + # ====================================================================== try: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("subtype", nargs="*", type=lambda src: src.upper()) - parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False) + parser.add_argument( + "-v", "--verbose", dest="verbose", action="store_true", default=False + ) args = parser.parse_args() exit_code = main(args) or 0 except Exception as err: