diff --git a/.gitignore b/.gitignore
index efcd6d3..12bb8e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ build
 .cache
 .ccls-cache
 __pycache__
+/.vscode
diff --git a/R/collapse_short_branches.R b/R/collapse_short_branches.R
new file mode 100644
index 0000000..e1f868e
--- /dev/null
+++ b/R/collapse_short_branches.R
@@ -0,0 +1,59 @@
+#! /usr/bin/env Rscript
+main = function() {
+  message("COLLAPSE SHORT BRANCH LENGTHS")
+
+  .libPaths(
+    c(
+      "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2",
+      .libPaths()
+    )
+  )
+
+  args <- commandArgs(trailingOnly = TRUE)
+
+  inphy.txt = args[[1]]
+  tol = as.numeric(args[[2]])
+
+  # file names -------------------------------------------------------------
+  inphy = readLines(inphy.txt, n = 1)
+
+  outphy = paste0(
+    paste0(
+      fs::path_ext_remove(inphy),
+      "-col."
+    ),
+    fs::path_ext(inphy)
+  )
+
+  outphy.txt = paste0(
+    paste0(
+      fs::path_ext_remove(inphy.txt),
+      "-col."
+    ),
+    fs::path_ext(inphy.txt)
+  )
+
+  if (fs::file_exists(outphy) & fs::file_exists(outphy.txt)) {
+    message(
+      outphy,
+      " and ",
+      outphy.txt,
+      " already exist! Continuing without re-collapsing branches"
+    )
+    return(0)
+  }
+
+  message("Reading tree")
+  phy = ape::read.tree(file = inphy)
+
+  message("Collapsing branches shorter than ", tol)
+  phy_col = ape::di2multi(phy, tol = tol)
+  message(ape::Nnode(phy), " -> ", ape::Nnode(phy_col))
+
+  message("Writing tree")
+  ape::write.tree(phy_col, file = outphy)
+  message("Writing ", outphy.txt)
+  writeLines(outphy, outphy.txt)
+}
+
+main()
diff --git a/R/rotate.R b/R/rotate.R
new file mode 100644
index 0000000..e2d7d86
--- /dev/null
+++ b/R/rotate.R
@@ -0,0 +1,71 @@
+#! /usr/bin/env Rscript
+main = function() {
+  message("ROTATE TREE")
+
+  .libPaths(
+    c(
+      "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2",
+      .libPaths()
+    )
+  )
+
+  args <- commandArgs(trailingOnly = TRUE)
+
+  inphy.txt = args[[1]]
+  infasta = args[[2]]
+
+  # file names
+  inphy = readLines(inphy.txt, n = 1)
+
+  outphy = paste0(
+    paste0(
+      fs::path_ext_remove(inphy),
+      "-rot."
+    ),
+    fs::path_ext(inphy)
+  )
+
+  outphy.txt = paste0(
+    paste0(
+      fs::path_ext_remove(inphy.txt),
+      "-rot."
+    ),
+    fs::path_ext(inphy.txt)
+  )
+
+  if (fs::file_exists(outphy) & fs::file_exists(outphy.txt)) {
+    message(
+      outphy,
+      " and ",
+      outphy.txt,
+      " already exist! Continuing without re-rotating tree."
+    )
+    return(0)
+  }
+
+  message("Reading fasta")
+  fa = ape::read.dna(
+    file = infasta,
+    format = "fasta",
+    as.character = T,
+    as.matrix = F
+  )
+
+  outgroup = names(fa)[[1]]
+
+  message("Reading ", inphy)
+  t = ape::read.tree(file = inphy)
+
+  message("Rooting tree at ", outgroup)
+  t = ape::root(t, outgroup = outgroup)
+  message("Ladderizing tree")
+  t = ape::ladderize(t, right = F)
+
+  message("Writing tree")
+  ape::write.tree(t, file = outphy)
+  message("Writing ", outphy.txt)
+  writeLines(outphy, outphy.txt)
+  return(0)
+}
+
+main()
diff --git a/R/set_min_branch_lengths.R b/R/set_min_branch_lengths.R
new file mode 100755
index 0000000..764afaa
--- /dev/null
+++ b/R/set_min_branch_lengths.R
@@ -0,0 +1,34 @@
+#! /usr/bin/env Rscript
+
+main = function() {
+  message("SET MINIMUM BRANCH LENGTHS")
+
+  .libPaths(
+    c(
+      "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2",
+      .libPaths()
+    )
+  )
+
+  args <- commandArgs(trailingOnly = TRUE)
+
+  infile = args[[1]]
+  outfile = args[[2]]
+  min_bl = as.numeric(args[[3]])
+
+  if (fs::file_exists(outfile)) {
+    message(outfile, " already exists. Not remaking.")
+    return(0)
+  }
+
+  message("Reading tree")
+  t = ape::read.tree(file = infile)
+
+  message("Setting minimum branch lengths to ", min_bl)
+  t$edge.length[t$edge.length < min_bl] = min_bl
+
+  message("Writing tree")
+  ape::write.tree(phy = t, file = outfile)
+}
+
+main()
diff --git a/R/tree_trim.R b/R/tree_trim.R
new file mode 100644
index 0000000..723760f
--- /dev/null
+++ b/R/tree_trim.R
@@ -0,0 +1,172 @@
+#! /usr/bin/env Rscript
+
+main = function() {
+  message("TRIM TREE")
+
+  .libPaths(
+    c(
+      "/home/samt123/R/x86_64-pc-linux-gnu-library/4.2",
+      .libPaths()
+    )
+  )
+
+  library(seqUtils, quietly = T, warn.conflicts = F, verbose = F)
+  library(convergence, quietly = T, warn.conflicts = F, verbose = F)
+
+  library(treeio, quietly = T, warn.conflicts = F, verbose = F)
+
+  library(purrr, quietly = T, warn.conflicts = F, verbose = F)
+  library(dplyr, quietly = T, warn.conflicts = F, verbose = F)
+  library(stringr, quietly = T, warn.conflicts = F, verbose = F)
+
+  convergence:::add_to_PATH("/home/sat65/miniforge3/envs/treebuild/bin") # usher
+
+  # args ------------------------------------------------------------
+  args <- commandArgs(trailingOnly = TRUE)
+
+  subtype = args[[1]]
+  inphy.txt = args[[2]]
+  infasta = args[[3]]
+
+  inphy = readLines(inphy.txt, n = 1)
+
+  outphy = paste0(
+    paste0(
+      fs::path_ext_remove(inphy),
+      "-trim."
+    ),
+    fs::path_ext(inphy)
+  )
+
+  outphy.txt = paste0(
+    paste0(
+      fs::path_ext_remove(inphy.txt),
+      "-trim."
+    ),
+    fs::path_ext(inphy.txt)
+  )
+
+  outfasta = paste0(
+    paste0(
+      fs::path_ext_remove(infasta),
+      "-trim."
+    ),
+    fs::path_ext(infasta)
+  )
+
+  if (all(fs::file_exists(c(outphy, outphy.txt, outfasta)))) {
+    message("All output files already exist. Not remaking.")
+    return(0)
+  }
+
+  # specify trim ----------------------------------------------------
+  if (subtype == "h3") {
+    trim = T
+    mutation = "223V"
+    required_mutations = list() # list(list(at = 100, to = "T"), list(at = ...))
+    forbidden_mutations = list()
+    min_tips = 15000
+  } else if (subtype == "h1") {
+    trim = F
+    mutation = NA
+    required_mutations = list()
+    forbidden_mutations = list()
+    min_tips = 0
+  } else if (subtype == "bvic") {
+    trim = T
+    mutation = "127T"
+    required_mutations = list()
+    forbidden_mutations = list()
+    min_tips = 15000
+  } else if (subtype == "byam") {
+    trim = F
+    mutation = NA
+    required_mutations = list()
+    forbidden_mutations = list()
+    min_tips = 0
+  } else {
+    stop("Invalid subtype ", subtype)
+  }
+
+  # read input files ------------------------------------------------------------
+
+  phy = ape::read.tree(file = inphy)
+  fa = seqUtils::fast_fasta(infasta)
+
+  # trim ------------------------------------------------------------
+  if (!trim) {
+    trimmed_phy = phy
+    trimmed_seqs = fa
+  } else {
+    tree_and_sequences = list(
+      tree = phy,
+      sequences = tibble::tibble(
+        Isolate_unique_identifier = names(fa),
+        dna_sequence = seqUtils::clean_sequences(unname(fa), type = "nt")
+      )
+    )
+
+    usher_tree_and_sequences = convergence::addASRusher(
+      tree_and_sequences,
+      nuc_ref = fa[1],
+      aa_ref = as.character(Biostrings::translate(Biostrings::DNAString(fa[1])))
+    )
+
+    usher_tree_and_sequences$tree_tibble$nd = c(
+      rep(1, ape::Ntip(usher_tree_and_sequences$tree)),
+      castor::count_tips_per_node(usher_tree_and_sequences$tree)
+    )
+
+    tree_tibble_mutation_occs = usher_tree_and_sequences$tree_tibble %>%
+      filter(
+        map_lgl(aa_mutations_nonsyn, ~mutation %in% str_sub(.x, 2)),
+        nd >= min_tips
+      ) %>%
+      arrange(-nd)
+
+    for (r_m in required_mutations){
+      at = as.integer(r_m[["at"]])
+      to = r_m[["to"]]
+      tree_tibble_mutation_occs = tree_tibble_mutation_occs %>%
+        filter(substr(reconstructed_aa_sequence, at, at) == to)
+    }
+
+    for (f_m in forbidden_mutations){
+      at = as.integer(f_m[["at"]])
+      to = f_m[["to"]]
+      tree_tibble_mutation_occs = tree_tibble_mutation_occs %>%
+        filter(substr(reconstructed_aa_sequence, at, at) != to)
+    }
+
+    if (nrow(tree_tibble_mutation_occs) != 1){
+      stop(nrow(tree_tibble_mutation_occs), " tree branches fulfilling specification found; expected exactly 1")
+    }
+
+    ancestor = tree_tibble_mutation_occs$node[[1]]
+
+    trimmed_phy = castor::get_subtree_at_node(
+      usher_tree_and_sequences$tree,
+      ancestor - ape::Ntip(usher_tree_and_sequences$tree)
+    )$subtree
+
+    trimmed_phy = ape::ladderize(trimmed_phy, right = F)
+    trimmed_phy$edge.length = trimmed_phy$edge.length / mean(nchar(fa)) # per nt
+    trimmed_seqs = fa[names(fa) %in% trimmed_phy$tip.label]
+  }
+
+  message("Writing tree")
+  ape::write.tree(trimmed_phy, file = outphy)
+
+  message("Writing ", outphy.txt)
+  writeLines(outphy, outphy.txt)
+
+  message("Writing ", outfasta)
+  seqUtils::write_fast_fasta(
+    seqs = unname(trimmed_seqs),
+    names = names(trimmed_seqs),
+    path = outfasta
+  )
+  return(0)
+}
+
+main()
diff --git a/README.md b/README.md
index 5afdaa1..a8f15e6 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,6 @@
 - install homebrew https://brew.sh
 - brew install llvm ninja meson libomp cmake brotli zlib xz gnu-time catch2
-- pip3 install mypy

 ## Installing dependencies on Ubuntu
diff --git a/bin/seqdb-update b/bin/seqdb-update
index db670c9..ab0fdde 100755
--- a/bin/seqdb-update
+++ b/bin/seqdb-update
@@ -39,8 +39,6 @@ def main(args: argparse.Namespace):
                 # only aligned are stored
                 ae_backend.raw_sequence.calculate_hash(sequence)
                 sequences_by_subtype.setdefault(sequence.type_subtype, []).append(sequence)
-            else:
-                print(f"{sequence.name:40s} {sequence.aa}")
         except Exception as err:
             print(f"> {err}: {sequence.name}\n {sequence.aa}")
         reader.context.messages_from_backend(sequence_messages)
diff --git a/cc/tree/newick.cc b/cc/tree/newick.cc
index 85ff59d..dd4f1f3 100644
--- a/cc/tree/newick.cc
+++ b/cc/tree/newick.cc
@@ -174,7 +174,7 @@ namespace ae::tree::newick {
        static constexpr auto whitespace = dsl::ascii::space / dsl::ascii::newline;
        static constexpr auto rule = dsl::p + dsl::semicolon + dsl::eof;

-       static constexpr auto max_recursion_depth = 1000;
+       static constexpr auto max_recursion_depth = 10000;

        static constexpr auto value = lexy::forward;
    };
diff --git a/cc/whocc/xlsx/sheet-extractor.cc b/cc/whocc/xlsx/sheet-extractor.cc
index 836de0f..9b9bfea 100644
--- a/cc/whocc/xlsx/sheet-extractor.cc
+++ b/cc/whocc/xlsx/sheet-extractor.cc
@@ -96,7 +96,7 @@ static const std::regex re_VIDRL_serum_name{"^(?:[AB]/)?([A-Z][A-Z ]+)/?([0-9]+)
 static const std::regex re_VIDRL_serum_id{"^(" pattern_VIDRL_serum_id "|" pattern_CRICK_serum_id ")$", regex_icase};
 static const std::regex re_VIDRL_serum_id_with_days{"^[AF][0-9][0-9][0-9][0-9]-[0-9]+D$", regex_icase};
-static const std::regex re_human_who_serum{R"(^\s*(.*(HUMAN|WHO|NORMAL)|GOAT|POST? VAX|VAX POOL|SERA POOL)\b)", regex_icase}; // "POST VAX": VIDRL H3 HI 2021, "HUMAN VAX": VIDRL H3 HI 2022
+static const std::regex re_human_who_serum{R"(^\s*(.*(HUMAN|WHO|NORMAL)|GOAT|POST? 
VAX|VAX POOL)\b)", regex_icase}; // "POST VAX": VIDRL H3 HI 2021, "HUMAN VAX": VIDRL H3 HI 2022 #pragma GCC diagnostic pop @@ -339,16 +339,13 @@ std::string ae::xlsx::v1::Extractor::titer(size_t ag_no, size_t sr_no) const void ae::xlsx::v1::Extractor::find_titers(warn_if_not_found winf) { std::vector>> rows; - AD_DEBUG("Sheet {} rows:{}", sheet().name(), sheet().number_of_rows()); + // AD_DEBUG("Sheet {}", sheet().name()); for (nrow_t row{0}; row < sheet().number_of_rows(); ++row) { auto titers = sheet().titer_range(row); - AD_DEBUG("Row:{} titers: ({} valid:{}) {}:{}", row, titers.length(), titers.valid(), titers.first, titers.second); adjust_titer_range(row, titers); - AD_DEBUG("Row:{} titers: ({} valid:{}) {}:{}", row, titers.length(), titers.valid(), titers.first, titers.second); if (titers.valid() && titers.length() > 2 && titers.first > ncol_t{0} && valid_titer_row(row, titers)) rows.emplace_back(row, std::move(titers)); } - AD_DEBUG("Rows with titers: {}", rows.size()); if (!ranges::all_of(rows, [&rows](const auto& en) { return en.second == rows[0].second; })) { fmt::memory_buffer report; // fmt::format(rows, "{}", "\n ")); diff --git a/proj/weekly-tree/export b/proj/weekly-tree/export index 580e904..bc61c17 100755 --- a/proj/weekly-tree/export +++ b/proj/weekly-tree/export @@ -2,23 +2,50 @@ import sys, os, argparse, traceback from pathlib import Path + sys.path[:0] = [str(Path(os.environ["AE_ROOT"], dir)) for dir in ["build", "py"]] import ae_backend from ae.sequences.source import fasta # ====================================================================== -sSubtypeForSeqdb = {"B": "B", "BVIC": "B", "BYAM": "B", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} -sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubtypeForSeqdb = { + "B": "B", + "BVIC": "B", + "BYAM": "B", + "A(H1N1)": "A(H1N1)", + "H1": "A(H1N1)", + "1": "A(H1N1)", + "A(H3N2)": "A(H3N2)", + "H3": "A(H3N2)", + "3": "A(H3N2)", +} +sSubtypeNorm = { + "BV": "BVIC", + "BVIC": "BVIC", + "BY": "BYAM", + "BYAM": "BYAM", + "A(H1N1)": "A(H1N1)", + "H1": "A(H1N1)", + "1": "A(H1N1)", + "A(H3N2)": "A(H3N2)", + "H3": "A(H3N2)", + "3": "A(H3N2)", +} sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + def main(args: argparse.Namespace): - subtypes = [sSubtypeNorm[st] for st in (args.subtype or ["BVIC", "BYAM", "H1", "H3"])] + subtypes = [ + sSubtypeNorm[st] for st in (args.subtype or ["BVIC", "BYAM", "H1", "H3"]) + ] for subtype in subtypes: sSubtypeToFunc[subtype](subtype=subtype, args=args) + # ---------------------------------------------------------------------- + def bvic(subtype: str, args: argparse.Namespace): outgroup = "VICTORIA/830/2013" selected = ( @@ -28,16 +55,116 @@ def bvic(subtype: str, args: argparse.Namespace): .lineage("VICTORIA") .exclude_with_issue() .filter_dates(first="2016-06") - .find_masters() # to be able to use .filter below - .exclude(lambda ref: ref.aa["165-"]) # exclude 6-del and MIE/1/2019 with garbage in the middle - .exclude_name([ - "ANHUI_JINGHU/63/2019", # garbage in 406-414 leading to a long branch (2021-0722) - "LISBOA/NISU182_17-18/2018", # yamagata, long branch 2021-1117 - # "ZAMBIA/503/2021", # potential long branch 2022-0401 - # 2023-01-28 potential long branch - "KRABI/FS002/2022", - ]) - .include_name(outgroup) # outgroup BRISBANE/60/2008, B/CHONGQING 
BANAN/1840/2017 (V1A) + .find_masters() # to be able to use .filter below + .exclude( + lambda ref: ref.aa["165-"] + ) # exclude 6-del and MIE/1/2019 with garbage in the middle + .exclude_name( + [ + "ANHUI_JINGHU/63/2019", # garbage in 406-414 leading to a long branch (2021-0722) + "LISBOA/NISU182_17-18/2018", # yamagata, long branch 2021-1117 + # "ZAMBIA/503/2021", # potential long branch 2022-0401 + # 2023-01-28 potential long branch + "KRABI/FS002/2022", + "CATALONIA/NSVH151199301/2023_OR_0ACCE47F", # XXXX 2023-0421 + "SOUTH_AFRICA/R10785/2022_OR_0EB8BBEA", # XXX 2023-0515 + "SOUTH_AFRICA/R10907/2022_OR_BA5B358C", + "SOUTH_AFRICA/R09298/2022_OR_1A5C630A", + "SOUTH_AFRICA/R10914/2022_OR_ADA6D682", + "SOUTH_AFRICA/R10638/2022_OR_5EAB4CBB", + "SOUTH_AFRICA/R11310/2022_OR_12125C84", + "SOUTH_AFRICA/R11575/2022_OR_FED84F4F", + "SOUTH_AFRICA/R08092/2022_OR_93DDF2B2", + # XXX 2023-0602 + "LA_REUNION/HCL023041635201/2022_103E7754", + "ROMANIA/550950/2023_MDCK-SIAT_E0DECCD2", + "SAINT_PETERSBURG/RII-10866S/2023_OR_477275F7", + # XXX2023-0629 + "ABU_DHABI/4030/2022_AEC2DE48", + "RIO_GRANDE_DO_SUL/28773/2022_455F4FFF", + # 2023-0731 + "MALAYSIA/IMR-SARI0563/2023_25FE3E0E", + # 2023-0811 Yam + "KENYA/KCH157321/2019_OR_308AC34A", + # 2023-0822 XXX + "ALAGOAS/5742/2023_OR_441065AD", + "ALAGOAS/4659/2023_OR_558AC3D6", + "PARANA/5012/2023_OR_D54659DC", + # 2023-1005 XXXX + "UNITED_KINGDOM/UO-476/2023", + # 2024-0130 + "SANTA_CATARINA/10748/2023_OR_7B192215", + # 2024-0703 + "MADRID/HULP-200063972/2024_C98ADFD1", # unknown at deletion positions 162-164 + "MADRID/HULP-200041842/2024_5D4B7141", # unknown at deletion positions 162-164 + # 2024-0718 + "AUSTRIA/MUW1681066/2024_OR_3D9C21FB", # XXXX + "ATHENS/ILI411/2024_OR_556F1400", # XXXX + # 2024-0411 + "PAIS_VASCO/1919/2024_OR_1D682639", + "GALICIA/52314290/2024_AF2005E4", + # 2025-2411 + "MADAGASCAR/3422/2024_OR_DF3D589F", + "MEXICO/2429277701/2024_OR_3BF2F243", + "GALICIA/GA-CHUAC-250/2025_OR_AC92C5F3", + # 2025-1202 + "PAKISTAN/NIC-JHL/2010_OR_EC56A7FD", + "PAKISTAN/NIC-ICT-284/2025_OR_8DAE4B6A", + "PAKISTAN/NIC-ICT-218/2025_OR_362FAB6C", + "PAKISTAN/NMC-12/2025_OR_7B217EEC", + "PAKISTAN/NIC-ICT-190/2025_OR_D4D6D601", + "ANDALUCIA/PMC-00528/2025_OR_549D6405", + # 2025-0313 + "GALICIA/52463655/2025_0971E6C6", + # 2025-0507 + "THUERINGEN/31/2025_OR_758A40BD", + "FRANCE/ARA-HCL025053223301/2025_OR_1C544F25", + "ABAY/NRL-2330/2025_C2E364F7", + "RIO_DE_JANEIRO/533/2025_OR_C04A6642", + "LA_REUNION/HCL023041635201/2022_OR_103E7754", + "ASTURIAS/232465739/2024_EA8BEFAB", + "UKRAINE/3471/2025_6E494813", + # 2025-0521 + "GALICIA/52475765/2025_F76C5888", + # 2025-0611 no exclusions + # 2025-0618 no exclusions + # 2025-0625 no exclusions + # 2025-0702 + "PHUKET/P1187/2025_5A09E7D5", + # 2025-0716 (omitted exclusions in error,later added in 0723) + # 2025-0723 + "ODISHA/1421/2023_6422458A", + # 2025-0729 no exclusions + # 2025-0818 + "INDIA/PUD-NIV25-1188/2025_MDCK3_935FB615", + # 2025-0827 no exclusions + # 2025-0910 no exclusions + # 2025-0917 + "INDIA/PUD-NIV25-1188/2025_MDCK3_935FB615", + "AVILA/227/2024_FLU0789_03C986A9", + "AVILA/43/2025_FLU0874_624F5516", + "SALAMANCA/214/2024_FLU0771_DF1DA731", + "LA_REUNION/CHU725061509401/2025_7A8734BB", + # 2025-1006 + "MEXICO/240450000409/2024_OR_1174529C", + # 2025-1028 no exclusions + # 2025-1112 + "SANTA_CATARINA/4844/2025_OR_8B4C49A1", + "TOULON/INF_20/2023_OR_06B04496", + "TOULON/INF_21/2023_OR_C17020D2", + "SANTA_CATARINA/4290/2025_OR_27BF4310", + "INDIA/RES-23-4646/2023_AB3871EB", + 
"INDIA/RES-23-1799/2023_840D4617", + "INDIA/RES-23-2404/2023_159F0FCE", + "INDIA/RES-23-2414/2023_A4EA11A3", + # 2025-1126 + "ODISHA/956/2023_9DD47677", + "B/LEE/40_6BCD57B7", + ] + ) + .include_name( + outgroup + ) # outgroup BRISBANE/60/2008, B/CHONGQING BANAN/1840/2017 (V1A) .remove_hash_duplicates() .replace_with_master() .check() @@ -47,8 +174,10 @@ def bvic(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1710, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- + def byam(subtype: str, args: argparse.Namespace): outgroup = "MINNESOTA/2/2014" selected = ( @@ -58,9 +187,15 @@ def byam(subtype: str, args: argparse.Namespace): .lineage("YAMAGATA") .exclude_with_issue() .filter_dates(first="2015") - .find_masters() # to be able to use .filter below - .include_name([outgroup]) # , "IDAHO/1/2014", "MASSACHUSETTS/7/2014"]) - .exclude_name([lin for lin in (line.strip() for line in Path("byam-exclude.txt").open()) if lin and lin[0] != "#"]) + .find_masters() # to be able to use .filter below + .include_name([outgroup]) # , "IDAHO/1/2014", "MASSACHUSETTS/7/2014"]) + .exclude_name( + [ + lin + for lin in (line.strip() for line in Path("byam-exclude.txt").open()) + if lin and lin[0] != "#" + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") @@ -69,15 +204,21 @@ def byam(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1710, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- + def h1(subtype: str, args: argparse.Namespace): h1_include_txt = Path("h1-include.txt") if not h1_include_txt.exists(): h1_include_txt = Path(sys.argv[0]).parent.joinpath(h1_include_txt.name) if not h1_include_txt.exists(): raise RuntimeError("> cannot find {h1_include_txt}") - to_include = [lin for lin in (line.strip() for line in h1_include_txt.open()) if lin and lin[0] != "#"] + to_include = [ + lin + for lin in (line.strip() for line in h1_include_txt.open()) + if lin and lin[0] != "#" + ] # to_include = to_include[:-500] outgroup = "SOUTH_AUSTRALIA/30/2013" selected = ( @@ -86,86 +227,294 @@ def h1(subtype: str, args: argparse.Namespace): .human() .exclude_with_issue() .filter_dates(first="2019") - .find_masters() # to be able to use .filter below - .include_name([ - outgroup, - "MICHIGAN/45/2015", - "BRAZIL/5026/2017", # # 183S, trying to show "the different 183P clades has S183P independently", inferred from 2018-0924-ssm tree - ] + to_include) # ~/AD/share/conf/h1.183S.2017-2018.seqids - .exclude_name([ - "SWINE/", - # 2020-08-10 Sarah: They're not sequenced by a CC or NIC, but by a company. The sequences don't have obvious errors, but are very different (obviously) from the rest. My feeling is they're either variant viruses (from swine) or a sequencing error. 
- "ANKARA/14015-724/2019", - "ANKARA/14017-004/2019", - "ANKARA/14015-736/2019", - # 2021-06-22 deletion at the end - # "BADEN-WURTTEMBERG/1/2020", - # long branch 2021-0622 - "DENMARK/1/2021", - "MANITOBA/2/2021", - # 2021-1117 long branches - "MECKLENBURG-VORPOMMERN/1/2021", - "MECKLENBURG-VORPOMMERN/1-A/2021", # refers to MECKLENBURG-VORPOMMERN/1/2021 - "NETHERLANDS/10370-1/2020", - "NETHERLANDS/10370-1A/2020", - "NETHERLANDS/10370-2/2020", # refers to NETHERLANDS/10370-1A/2020 - "NETHERLANDS/GENT-193/2019", - "HESSEN/47/2020", - "HUBEI WUJIAGANG/1324/2020", - "HEBEI HAIGANG/SWL1572/2019", - "SHANDONG/204/2021", - "YUNNAN MENGZI/1462/2020", - "NORTH CAROLINA/15/2020", - "WISCONSIN/3/2021", - "PARANA/10835/2021", - # 2021-1125 long branches - "GANSU_XIFENG/1194/2021", - "GANSU_XIFENG/1143/2021", - "SICHUAN/1208/2021", - "TIANJIN/30/2020", - "WISCONSIN/5/2021", - "WISCONSIN/4/2021", - "NORTH_DAKOTA/12226/2021", - # 2021-1217 potential long branch - "IOWA/1/2021", # --> SWINE/MISSOURI/A01104146/2020 (base), SWINE/MISSOURI/A02525160/2021 - "IOWA/6/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" - "NORTH DAKOTA/12226/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" - # 2021-1217 long branches - "NETHERLANDS/10370-1A/2020", - # 2022-0131 potential long branch - "DENMARK/36/2021", - # 2022-04-26 mess in 156-179 - "INDIA/NIV-NICOBAR91/2020_OR_01A87F80", - "INDIA/NIV-NICOBAR163/2020_OR_823B2A0F", - "INDIA/NIV-NICOBAR106/2020_OR_B81F1DC2", - "INDIA/NIV-NICOBAR164/2020_OR_37A11133", - # 2022-10-07 potential long branch - "PARANA/20675/2022_OR_3D8D671C", - # 2022-12-10 potential long branch - "DAKAR/3124/2021_279E04A6", - # 2022-12-23 potential long branch - "NAVARRA/4050/2022_OR_BBC31B3A", - # 2023-01-13 potential long branch - "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", - # 2023-01-28 potential long branches - "NAVARRA/4050/2022_OR_BBC31B3A", - "DAKAR/3124/2021_279E04A6 pdm09-like", - "PAKISTAN/GIHSN-GB-445/2022_OR_A20D55B6", - "PAKISTAN/GIHSN-ICT-152/2023_OR_572EBEB8", - "PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", - "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", - "CATALONIA/NSVH101996495/2023_OR_2E610047", - ]) + .find_masters() # to be able to use .filter below + .include_name( + [ + outgroup, + "MICHIGAN/45/2015", + "BRAZIL/5026/2017", # # 183S, trying to show "the different 183P clades has S183P independently", inferred from 2018-0924-ssm tree + ] + + to_include + ) # ~/AD/share/conf/h1.183S.2017-2018.seqids + .exclude_name( + [ + "SWINE/", + # 2020-08-10 Sarah: They're not sequenced by a CC or NIC, but by a company. The sequences don't have obvious errors, but are very different (obviously) from the rest. My feeling is they're either variant viruses (from swine) or a sequencing error. 
+ "ANKARA/14015-724/2019", + "ANKARA/14017-004/2019", + "ANKARA/14015-736/2019", + # 2021-06-22 deletion at the end + # "BADEN-WURTTEMBERG/1/2020", + # long branch 2021-0622 + "DENMARK/1/2021", + "MANITOBA/2/2021", + # 2021-1117 long branches + "MECKLENBURG-VORPOMMERN/1/2021", + "MECKLENBURG-VORPOMMERN/1-A/2021", # refers to MECKLENBURG-VORPOMMERN/1/2021 + "NETHERLANDS/10370-1/2020", + "NETHERLANDS/10370-1A/2020", + "NETHERLANDS/10370-2/2020", # refers to NETHERLANDS/10370-1A/2020 + "NETHERLANDS/GENT-193/2019", + "HESSEN/47/2020", + "HUBEI WUJIAGANG/1324/2020", + "HEBEI HAIGANG/SWL1572/2019", + "SHANDONG/204/2021", + "YUNNAN MENGZI/1462/2020", + "NORTH CAROLINA/15/2020", + "WISCONSIN/3/2021", + "PARANA/10835/2021", + # 2021-1125 long branches + "GANSU_XIFENG/1194/2021", + "GANSU_XIFENG/1143/2021", + "SICHUAN/1208/2021", + "TIANJIN/30/2020", + "WISCONSIN/5/2021", + "WISCONSIN/4/2021", + "NORTH_DAKOTA/12226/2021", + # 2021-1217 potential long branch + "IOWA/1/2021", # --> SWINE/MISSOURI/A01104146/2020 (base), SWINE/MISSOURI/A02525160/2021 + "IOWA/6/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + "NORTH DAKOTA/12226/2021", # --> "SWINE/IOWA/A02636087/2021", "SWINE/IOWA/A02636130/2021" + # 2021-1217 long branches + "NETHERLANDS/10370-1A/2020", + # 2022-0131 potential long branch + "DENMARK/36/2021", + # 2022-04-26 mess in 156-179 + "INDIA/NIV-NICOBAR91/2020_OR_01A87F80", + "INDIA/NIV-NICOBAR163/2020_OR_823B2A0F", + "INDIA/NIV-NICOBAR106/2020_OR_B81F1DC2", + "INDIA/NIV-NICOBAR164/2020_OR_37A11133", + # 2022-10-07 potential long branch + "PARANA/20675/2022_OR_3D8D671C", + # 2022-12-10 potential long branch + "DAKAR/3124/2021_279E04A6", + # 2022-12-23 potential long branch + "NAVARRA/4050/2022_OR_BBC31B3A", + # 2023-01-13 potential long branch + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + # 2023-01-28 potential long branches + "NAVARRA/4050/2022_OR_BBC31B3A", + "DAKAR/3124/2021_279E04A6 pdm09-like", + "PAKISTAN/GIHSN-GB-445/2022_OR_A20D55B6", + "PAKISTAN/GIHSN-ICT-152/2023_OR_572EBEB8", + "PAKISTAN/GIHSN-PB-15/2023_OR_484A375D", + "SAO_PAULO/356765942-IAL/2022_OR_71BA8B9D", + "CATALONIA/NSVH101996495/2023_OR_2E610047", + # 2023-02-04 potential long branch + "SWITZERLAND/69094/2022_OR_B3C15C9B", + # 2023-0421 potential long branches + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + "PAKISTAN/1465/2023_OR_1B7B0A1A", + "BADEN-WURTTEMBERG/159/2022_OR_6AB83E7E", + "PARANA/44706/2021_OR_25BFEB40", + # XXX 2023-0515 + "A/PAKISTAN/GIHSN/ICT/1838/2023_0867EDC7", + "A/PAKISTAN/GIHSN/ICT/1634/2023_OR_3D55578F", + "A/PAKISTAN/GIHSN/ICT/1798/2023_OR_C271B3C9", + "A/PAKISTAN/GIHSN/ICT/1682/2023_OR_FB7A0386", + # XXX 2023-0602 + "PAKISTAN/GIHSN-ICT2035/2023_OR_4E73FFF6", + # XXX 2023-0731 + "CYCLADES/215/2022_D164055B", + # long branch & XXXX 2023-0811 + "GREECE/SRIHER-001/2022_OR_E68E256A", + "SAUDI_ARABIA/40/2020_OR_9B1E74A4", + "SAUDI_ARABIA/39/2020_OR_DAE6A100", + # 2023-0822 XXXX + "PARANA/6405/2023_OR_9ACD4010", + "BAHIA/5785/2023_OR_CBFA332A", + # 2023-0903 very long branch + "NETHERLANDS/10534/2023_OR_6221D5BF", + # 2023-09-08 long branch in tree + "BADAJOZ/18570374/2023_58158069", + # 2023-09-15 XXXX + "ALGIERS/251/2023_OR_1A27768", + # 2023-10-23 XXX & odd start for SAO_PAULO + "SAO_PAULO/CEVI_2300134/2023_OR_BB923F69", + "CAMBODIA/KVH230151/2023_OR_1F665D9C", + "CAMBODIA/AHC230229/2023_OR_3241A3FC", + # 2023-20-28 + "BULGARIA/725/2023_8A6AF67F", # nonsense 1-29 + "CAMBODIA/CCH230213/2023_OR_BE937275", # XXXX + # 2023-12-04 + "BANGKOK/P5113/2023_OR_CB7624FD", # long branch + # 
2024-01-01 + "SWITZERLAND/114/2023_MDCK1_A2D22DAA", # long branch + # 2024-01-30 + "CATALONIA/NSAV198289092/2023_OR_291AD7EDPARANA/13897/2023_OR_0A657176", + "BELGIUM/H00082/2023_56A80548", + "CACERES/59816382/2023_A1324011", + "SENEGAL/2098/2023_OR_2EC7BF97", + # 2024-0703 + "GALICIA/GA-CHUAC-98/2024_OR_988888E6", # 29 nuc differences, 19 nuc differences + "MEXICO/2271/2021_338E9C30", # run of differences, 35-38, 40-42 + "ALAGOAS/CEVI_2200198/2021_OR_0811FCC2", # 6 X + "MADRID/1832324191549/2024_OR_988BC424", # 9 X + "HUMAN/KENYA/5866/2023_3F1D8A0B", # 9X + # 2024-0718 + "SAO_PAULO/CEVI_24000816/2024_OR_BC9F903C", # XXXX + "SAO_PAULO/CEVI_24000889/2024_OR_C69399E2", # dels + # 2024-0727 + "LUXEMBOURG/LNS3963248/2024_04C1C479", # XXXX + "COSTA_RICA/INC-0552-822365/2024_OR_9E1DE43D", # XXXX + # 2024-08-06 + "CHANTHABURI/P2877/2024_OR_5B993EF6", # XXXXX + "SRI_LANKA/R4_BC75/2024_50C5FC20" # XXXXX + # 2024-0830 XXXX & many changes + "OHIO/251/2024_OR_30CB2DAF", + "HANOI/PF324163/2024_OR_402B095E", + "SOUTH_AFRICA/PET32788/2024_OR_47322CBB", + # 2024-0902 long branch + "YAOUNDE/24V-3413/2024_8196548B", + # 2024-1004 + "PHUKET/P3423/2024_OR_5E3C7D19", # XXXX + # 2024-1025 + "PARANA/10835/2021_OR_8025A28A", + "TOLEDO/PARANA_10835/2021_OR_8025A28A", + "ENGLAND/3640434/2024_OR_A126DDAB", + "CATALONIA/NSAV198309324/2024_OR_29D1522E", + # 2024-1126 + "OREGON/FLU-OHSU-242090049/2024_05BD35BD", + "OREGON/FLU-OHSU-242090044/2024_DEC6E084", + # 2024-1209 + "NAKHON_PHANOM/F2539/2024_OR_276AD388", + # 2025-0901 + "CAMBODIA/KPH240133/2024_OR_CD9DB776", + "ENGLAND/4820116/2024_OR_F5C11575", + # 2025-2401 + "GALICIA/GA-CHUAC-172/2024_OR_000994B0", + "ROMANIA/2920/2024_F73C0E92", + # 2025-1202 + "BALTIMORE/JH-173/2025_A2FCE291", + "CROATIA/HZJZ-8235/2025_170E7DD7", + "ENGLAND/340238/2025_OR_EA36F218", + "ENGLAND/121421/2024_OR_2094499C", + "ANDALUCIA/PMC-00542/2025_OR_BD2FE74E", + "PAKISTAN/NIC-ICT-576/2025_OR_6924EA8E", + "ANDALUCIA/PMC-00549/2025_OR_A6464885", + "CASTELLON/VAHNSI-GIHSN-02_0386/2025_OR_2E26A6D3", + # 2025-0313 + "QUEENSLAND/IN000908/2025_91CEAA08", + "FRANCE/HDF-RELAB-IPP01090/2025_OR_096FF1D6", + "CATALONIA/NSVH171785721/2025_OR_C896185E", + "KAMCHATKA/205-20V/2025_OR_68CABAFF", + # 2025-0507 + "ENGLAND/5100240/2024_OR_EB872B9D", + "CAMBODIA/CCH250029/2025_OR_A4964B9E", + "ASTURIAS/232523061/2025_58877E98", + "PAKISTAN/NIC_SD193/2025_OR_3A27DCA6", + "ESPIRITO_SANTO/475/2025_OR_4FB190FF", + "PAKISTAN/NIC_SD184/2025_OR_66AF2F25", + "PAKISTAN/NIC_JHL115/2025_OR_53A17665", + "GALICIA/22647653/2025_DE4A5E72", + "IRELAND/4321/2025_8EB775D5", + "ENGLAND/640250/2025_OR_01187600", + "PAKISTAN/NIC_NIH749/2025_OR_40914714", + "BELGIUM/S4050/2025_OR_E331B39E", + "GALICIA/GA-CHUAC-314/2025_OR_59908BA2", + "MURCIA/528/2025_OR_E7444F60", + "OREL/SWINE-RII-WD2678S/2025_OR_6C9DA738", + "OREL/SWINE-RII-WD2681S/2025_OR_B004CABD", + "VERMONT/UVM-0478/2022_OR_DFD37A87", + "VERMONT/UVM-1652/2022_OR_42E6DF65", + "FRANCE/NOR-IPP04189/2025_OR_88070E22", # caused tree failure + # 2025-0716 (omitted exclusions in error, added 29/07, some added 0723 PR) + "CAMBODIA/SVH250268/2025_OR_238D9DD5", + "CAMBODIA/IAHC250126/2025_OR_FDAC2F6D", + "CAMBODIA/IKCM250040/2025_OR_7BD02395", + "CAMBODIA/CCH250037/2025_OR_347A5F98", + # 2025-0723 + "NIZHNIY_NOVGOROD/RII-MH239577S/2025_OR_B6AF35B5", + "CAMBODIA/KVH250142/2025_OR_3F8690DC", + "CAMBODIA/IAHC250133/2025_OR_C34D3AC3", + "SURGUT/RII-MH237071S/2025_OR_3A47CC28", + "MOROCCO/1168/2025_OR_910D767C", + # 2025-0729 + "ARKHANGELSK/RII-MH246940S/2025_OR_78BF9CC7", + # 2025-0818 + 
"CATALONIA/NSVH161521968/2025_OR_A0FA1338", + "CAMBODIA/NPH250451/2025_OR_3F664AAE", + "CAMBODIA/IAHC250237/2025_OR_E4433FD6", + "VALENCIA/VAHNSI-GIHSN-09_0742/2025_OR_30A76211", + "SAO_PAULO/CEVI_2500217/2025_OR_A0D5251B", + "SAO_PAULO/CEVI_2500207/2025_OR_3C666E5E", + "HONG_KONG/F40/2024_OR_94A307F6", + # 2025-0827 + "KRASNOGORSK/RII-MH248987S/2025_OR_CA8C4660", + # 2025-0910 no exclusions + # 2025-0917 + "CATALONIA/NSVH161521968/2025_OR_A0FA1338", + "CAMBODIA/IAHC250237/2025_OR_E4433FD6", + "CAMBODIA/NPH250451/2025_OR_3F664AAE", + "VALENCIA/VAHNSI-GIHSN-09_0742/2025_OR_30A76211", + "SAO_PAULO/CEVI_2500207/2025_OR_3C666E5E", + "KRASNOGORSK/RII-MH248987S/2025_OR_CA8C4660", + "HONG_KONG/F40/2024_OR_94A307F6", + # 2025-1006 no exclusions + # 2025-1028 + "SENEGAL/IPD-2503/2025_OR_20CCF2D9", + "POLAND/D7325_10/2023_763600E9", + "NETHERLANDS/D6904_36/2022_66C9DF52", + "POLAND/D7325_7/2023_CE4D3423", + "NETHERLANDS/D7061_16/2022_8D08423F", + "SPAIN/D7228_1/2022_F78E9BF7", + "AUSTRIA/D7225_20/2023_12AC9B5C", + "SPAIN/D7228_40/2023_36844405", + "BELGIUM/D6840_21/2022_32C5D497", + "SPAIN/D7228_50/2023_99D2EC08", + "ITALY/D7434_7/2023_BC9D4B6B", + "PORTUGAL/D7272_2/2023_0ED2CAE8", + "GERMANY/D7225_5/2023_13A6321B", + "GERMANY/D7225_17/2023_E6C859C1", + "POLAND/D7325_11/2023_906FE020", + "NETHERLANDS/D6840_51/2022_E7F7E7FB", + "GERMANY/D7061_22/2023_33D65F89", + "POLAND/D6904_24/2022_AF273C3D", + "POLAND/D7325_23/2023_36B8A0E4", + "NETHERLANDS/D7061_27/2022_DCE07BA1", + "POLAND/D6904_5/2022_240EA095", + "POLAND/D6840_59/2022_95E58799", + "BELGIUM/D7157_61/2023_6A0D50EA", + "GERMANY/D7061_33/2022_6DD93351", + "SLOVENIA/D7378_1_1/2023_473636B1", + "GERMANY/D7061_8/2022_5C22171F", + "GERMANY/D7225_1/2023_6BCBB1AC", + "GERMANY/D7061_11/2023_EA3CB29E", + "GERMANY/D7061_9/2023_D612A4DA", + "BULGARIA/D6904_28/2022_74CF41D4", + "BULGARIA/D6904_25/2022_DA88ADA9", + "SPAIN/D7228_24/2023_F14A6077", + # 2025-1112 + "RIYADH/63/2021_OR_677764E9", + "RIYADH/65/2022_OR_B365D7CC", + "RIYADH/19/2021_OR_9E0E1562", + "RIYADH/88/2022_OR_8E98C98E", + "RIYADH/100/2022_OR_8B809E4C", + "RIYADH/111/2023_OR_886090A3", + "RIYADH/120/2023_OR_EA0E7626", + "RIYADH/49/2022_OR_DB27594B", + "RIYADH/88/2023_OR_E11E47BF", + "RIYADH/66/2022_OR_03C7C7E0", + "TOULON/INF_4/2023_OR_A4196D00", + # 2025-1126 + "CAMBODIA/KPH250113/2025_OR_EABF2384", + "CAMBODIA/IAHC250287/2025_OR_3BF3D0EB", + "CAMBODIA/IMDK250044/2025_OR_037B06A4", + "ASTURIAS/232557424/2025_A6C56596", + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") .move_name_to_beginning(outgroup) ) - report(subtype, selected, expected_nuc_size=1647, limit=0) # aa 549 + report(subtype, selected, expected_nuc_size=1647, limit=0) # aa 549 export(subtype, selected) + # ---------------------------------------------------------------------- + def h3(subtype: str, args: argparse.Namespace): outgroup = "PERTH/16/2009" selected = ( @@ -174,141 +523,321 @@ def h3(subtype: str, args: argparse.Namespace): .human() .exclude_with_issue() .filter_dates(first="2018-03") - .find_masters() # to be able to use .filter below - .include_name([ - outgroup, - "VICTORIA/361/2011", # 2011-10-24 3C.3 - # [2020-09-07] Nearest common ancestor of 2a and 3a in /syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ - "MARYLAND/30/2012", # 2012-07-26 3C.3 - # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough - "TEXAS/50/2012", # 2012-04-15 3C.3 root in 
MELB tree 2020 - "VERMONT/6/2012", - "STOCKHOLM/6/2014", # 2014-02-06 3a - # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top - "SWITZERLAND/9715293/2013", # 2013-12-06 3a - "NORWAY/466/2014", # 2014-02-03 3a - "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a - "TASMANIA/11/2014", # 2014-03-16 3a - "KOBE/63/2014", # 2014-05-21 3a - "PERU/27/2015", # 2015-04-13 3a - "NEVADA/22/2016", # 2016-03-05 3a - "IDAHO/33/2016", # 2016-06-08 3a - "TEXAS/88/2016", # 2016-02-25 3a - "TEXAS/71/2017", # 2017-03-18 3a - "BRAZIL/7331/2018", # 2018-07-09 3a - "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page - "HONG_KONG/4801/2014", # 2014-02-26 2a - "HAWAII/47/2014", # 2014-07-18 2a - "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 - "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 - "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 - "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K - "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K - "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K - "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K - - # Strains before and after T135N substitution to have a better 135N branch placement - # Sarah 2021-02-08 17:05 - "WISCONSIN/85/2016", - "SRI_LANKA/56/2017", - "SOUTH_CAROLINA/4/2017", - "YOKOHAMA/145/2017", - "INDIA/9930/2017", - "HONG_KONG/3118/2017", - "HAWAII/47/2014", - "NIIGATA-C/43/2015", - "DAKAR/17/2016", - # "CAMEROON/16V-9267/2016", # excluded, truncated sequence - "LAOS/3008/2016", - "YUNNAN_LINXIANG/1718/2016", - "HONG_KONG/2302/2016", - "ONTARIO/RV2414/2015", - "ONTARIO/RV2414/2015", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - "HONG_KONG/2286/2017", - ]) - .exclude_name([ - "SWINE/", - # long branch (2021-0622) - "WISCONSIN/1/2021", - # long branch (2021-1117) - "HAWAII/28/2020", - "SOUTH_AUSTRALIA/85/2018", - "SOUTH_AUSTRALIA/1/2021", - "MANITOBA/3/2021", - "GUANGXI_GANGBEI/190/2019", - # 2021-1217 potential long branch - "OHIO/6/2021_ISOLATE", - # 2022-0204 potential long branch - "FRANCE/ARA-HCL021219268801/2021", - "FRANCE/ARA-HCL021220304101/2021", - # 2022-0209 long branch, eight X, R in nuc - "ONTARIO/RV2002/2019_P1_0F1E8F1E", - # 2022-0415 potential long branch - "ACRE/181823-IEC/2021", - # 2022-0819 potential long branch - "CATALONIA/NSVH101883267/2022_OR_F79C7813", - # 2022-0825 long branch in the tree - "INDIANA/27/2018_SIAT1_26C3A2A4", - "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", - "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", - # 2022-0826 potential long branch - "FIJI/99/2022_OR_80B95BF8", - "FIJI/102/2022_OR_6AEB79D8", - "WEST_VIRGINIA/25/2022_SIAT1_3E592413", - "WEST_VIRGINIA/26/2022_SIAT1_3E592413", - "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", - # 2022-0902 potential long branch - "FRANCE/ARA-HCL022060609601/2022_65F4520B", - # 2022-10-07 potential long branch - "CATALONIA/NSVH101908168/2022_OR_D5C0176E", - # 2022-10-28 potential long branch - "MICHIGAN/48/2022_OR_910196F9", - # 2022-11-04 potential long branch - "NEW_MEXICO/19/2022_IRX_4BA11056", - # 2022-11-11 potential long branches - "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", - "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", - "ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", - # 2022-11-26 potential long branches - "CATALONIA/NSVH101954411/2022_OR_7A633967", - "CATALONIA/NSVH510598655/2022_OR_57BCC58B", - # 2022-12-02 potential long branches - "MISSOURI/19/2022_OR_837C934E", - # 2022-12-10 potential long branches - "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 
159X 160X 162X 163X 164X - "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X - "MICHIGAN/48/2022_OR_910196F9", - "MICHIGAN/UOM10047284024/2022_OR_910196F9", - # 2023-01-08 potential long branches - "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", - # 2023-01-13 potential long branch - "GALICIA/22034482/2022_2EE89148", - "GALICIA/22018524/2022_1EE46082", - "GALICIA/22033804/2022_0EE699D6", - "GALICIA/21987208/2022_F49E5748", - "GALICIA/22036079/2022_430E996A", - "GALICIA/22018743/2022_0116B017", - "GALICIA/22011636/2022_3D9F40AD", - # 2023-01-20 potential long branch - "SINGAPORE/GP9117/2022_OR_69761A51", - "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", - # 2023-01-28 potential long branches - "MICHIGAN/48/2022_OR_910196F9", - "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", - "KHON_KAEN/FS005/2022_OR_1A5637D8", - "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", - "ATHENS/355/2022_D9D4A86", - "CATALONIA/NSVH101994299/2023_OR_13A43DA9", - "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", - "SOUTH_AFRICA/R07603/2022_OR_7937FE46", - "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", - "GALICIA/22034482/2022_2EE89148", - "CATALONIA/NSVH560028269/2022_OR_9C189CFA", - ]) + .find_masters() # to be able to use .filter below + .include_name( + [ + outgroup, + "VICTORIA/361/2011", # 2011-10-24 3C.3 + # [2020-09-07] Nearest common ancestor of 2a and 3a in /syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ + "MARYLAND/30/2012", # 2012-07-26 3C.3 + # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough + "TEXAS/50/2012", # 2012-04-15 3C.3 root in MELB tree 2020 + "VERMONT/6/2012", + "STOCKHOLM/6/2014", # 2014-02-06 3a + # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top + "SWITZERLAND/9715293/2013", # 2013-12-06 3a + "NORWAY/466/2014", # 2014-02-03 3a + "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a + "TASMANIA/11/2014", # 2014-03-16 3a + "KOBE/63/2014", # 2014-05-21 3a + "PERU/27/2015", # 2015-04-13 3a + "NEVADA/22/2016", # 2016-03-05 3a + "IDAHO/33/2016", # 2016-06-08 3a + "TEXAS/88/2016", # 2016-02-25 3a + "TEXAS/71/2017", # 2017-03-18 3a + "BRAZIL/7331/2018", # 2018-07-09 3a + "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page + "HONG_KONG/4801/2014", # 2014-02-26 2a + "HAWAII/47/2014", # 2014-07-18 2a + "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 + "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 + "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 + "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K + "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K + "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K + "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K + # Strains before and after T135N substitution to have a better 135N branch placement + # Sarah 2021-02-08 17:05 + "WISCONSIN/85/2016", + "SRI_LANKA/56/2017", + "SOUTH_CAROLINA/4/2017", + "YOKOHAMA/145/2017", + "INDIA/9930/2017", + "HONG_KONG/3118/2017", + "HAWAII/47/2014", + "NIIGATA-C/43/2015", + "DAKAR/17/2016", + # "CAMEROON/16V-9267/2016", # excluded, truncated sequence + "LAOS/3008/2016", + "YUNNAN_LINXIANG/1718/2016", + "HONG_KONG/2302/2016", + "ONTARIO/RV2414/2015", + "ONTARIO/RV2414/2015", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + ] + ) + .exclude_name( + [ + "SWINE/", + # long branch (2021-0622) + "WISCONSIN/1/2021", + # long branch (2021-1117) + "HAWAII/28/2020", + 
"SOUTH_AUSTRALIA/85/2018", + "SOUTH_AUSTRALIA/1/2021", + "MANITOBA/3/2021", + "GUANGXI_GANGBEI/190/2019", + # 2021-1217 potential long branch + "OHIO/6/2021_ISOLATE", + # 2022-0204 potential long branch + "FRANCE/ARA-HCL021219268801/2021", + "FRANCE/ARA-HCL021220304101/2021", + # 2022-0209 long branch, eight X, R in nuc + "ONTARIO/RV2002/2019_P1_0F1E8F1E", + # 2022-0415 potential long branch + "ACRE/181823-IEC/2021", + # 2022-0819 potential long branch + "CATALONIA/NSVH101883267/2022_OR_F79C7813", + # 2022-0825 long branch in the tree + "INDIANA/27/2018_SIAT1_26C3A2A4", + "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", + "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", + # 2022-0826 potential long branch + "FIJI/99/2022_OR_80B95BF8", + "FIJI/102/2022_OR_6AEB79D8", + "WEST_VIRGINIA/25/2022_SIAT1_3E592413", + "WEST_VIRGINIA/26/2022_SIAT1_3E592413", + "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", + # 2022-0902 potential long branch + "FRANCE/ARA-HCL022060609601/2022_65F4520B", + # 2022-10-07 potential long branch + "CATALONIA/NSVH101908168/2022_OR_D5C0176E", + # 2022-10-28 potential long branch + "MICHIGAN/48/2022_OR_910196F9", + # 2022-11-04 potential long branch + "NEW_MEXICO/19/2022_IRX_4BA11056", + # 2022-11-11 potential long branches + "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", + "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", + "ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", + # 2022-11-26 potential long branches + "CATALONIA/NSVH101954411/2022_OR_7A633967", + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + # 2022-12-02 potential long branches + "MISSOURI/19/2022_OR_837C934E", + # 2022-12-10 potential long branches + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 159X 160X 162X 163X 164X + "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X + "MICHIGAN/48/2022_OR_910196F9", + "MICHIGAN/UOM10047284024/2022_OR_910196F9", + # 2023-01-08 potential long branches + "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", + # 2023-01-13 potential long branch + "GALICIA/22034482/2022_2EE89148", + "GALICIA/22018524/2022_1EE46082", + "GALICIA/22033804/2022_0EE699D6", + "GALICIA/21987208/2022_F49E5748", + "GALICIA/22036079/2022_430E996A", + "GALICIA/22018743/2022_0116B017", + "GALICIA/22011636/2022_3D9F40AD", + # 2023-01-20 potential long branch + "SINGAPORE/GP9117/2022_OR_69761A51", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + # 2023-01-28 potential long branches + "MICHIGAN/48/2022_OR_910196F9", + "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", + "KHON_KAEN/FS005/2022_OR_1A5637D8", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + "ATHENS/355/2022_D9D4A86", + "CATALONIA/NSVH101994299/2023_OR_13A43DA9", + "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", + "SOUTH_AFRICA/R07603/2022_OR_7937FE46", + "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", + "GALICIA/22034482/2022_2EE89148", + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", + # 2023-02-04 potential long branches + "MANITOBA/RV00728/2022_P1_5236154F", + "ONTARIO/198/2022_OR_D1EC3B40", + # 2023-04-21 potential long branches + "ROMANIA/493/2023_612606B9", + "PAKISTAN/1140/2023_OR_3D2D111B", + "CATALONIA/NSVH102045605/2023_OR_7490FFA8", + # XXX & long branch 2023-0731 + "BAHIA/PVM105707/2021_OR_BCE07E46", + "SOUTH_AFRICA/K056301/2023_30E0E639", + # XXXXX 2023-0811 CORRECTED 2023-1023 + "BAHIA/292038965/2021_70230607", + "BAHIA/292104308/2022_D4526D62", + # XXXXX 2023-0829 CORRECTED 2023-1023 + "SOUTH_AFRICA/PET28428/2023_7572E223", + "SOUTH_AFRICA/PET28193/2023_BF46BBB4", + "NORWAY/8084/2023_OR_4548A267", + # 2023-09-08 potential long branch ?why + 
"SOUTH_AFRICA/K056301/2023_CC1FE952" + # 2023-10-23 XXX + "PARA/CEVI_2200263/2021_OR_6D24213E", + "SAO_PAULO/CEVI_2200855/2022_OR_D4E825BF", + "SAO_PAULO/CEVI_2200871/2022_OR_280FCE18", + # 2023-10-28 + "SOUTH_AFRICA/K056301/2023_CC1FE952", # very divergent + # 2023-12-04 XXXX + "OMAN/CPHL_52323056/2023_OR_297A7E60", + "LUSAKA/1-NIC-332/2023_OR_BA018CE9", + "NORWAY/11072/2023_OR_EB1EA580", + # 2024-01-01 + "SAINT_PETERSBURG/RII-MH151473S/2023_OR_D20D279D", # XX + "SAINT_PETERSBURG/RII-4937S/2023_OR_95887A8C", # XX + "CANADA/NOVASCOTIA-32036/2022_A90EF453", # first 7 aa + "BRITISH_COLUMBIA/PHL-602/2022_OR_A1B8E845", # XX + # 2024-01-22 for 2024-01-16 file + "INDONESIA/BIOKES-IBKL1127/2023_OR_791937E7", # long branch + "SAO_PAULO/CEVI_2201055/2022_OR_BF9D3752", # XX + "ROMANIA/3227/2023_E20D207E", # XX + "SAO_PAULO/CEVI_2200903/2022_OR_BB30A339", # XX + "CACERES/50644480/2023_361FA786", # long branch + "BELGIUM/S1112/2023_OR_D7ABA060", # long branch + "BELGIUM/S0576/2023_OR_CEC5B896", # long branch + "BELGIUM/S0220/2023_OR_0B549607", # long branch + "BELGIUM/H0592/2022_OR_22BD830C", # long branch + # 2024-01-30 + "SPAIN/GA-HULA-615019/2023_B7077195", + "BELGIUM/S03941/2023_F4250630", + "NEW_YORK/392/2004_CFB2F75C", + "SPAIN/GA-HULA-615037/2023_E8183B9E", + "CACERES/59816379/2023_4D9F91F8", + "TURKEY/GIHSN_ANKARA-140111_950/2023_E0D8B88D", + # 2024-02-11 XXXX Indonesia is long branch + "KENYA/AFI-KDH-1820/2023_B3509344", + "INDONESIA/BIOKES-IDSW0063/2023_OR_5962326E", + "KENYA/AFI-ISB-1107/2023_E5F19B22", + "KENYA/AFI-MNB-1100/2023_6D42271", + # 2024-0703 + "CHIANG_RAI/P2287/2024_OR_2744AA9C", # 3 deletions + # 2024-0727 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # 198 diffs + "WASHINGTON/S25532/2024_OR_13A107A6", # XXXX + "OREGON/S25529/2024_OR_5502D106", # XXXX + "OREGON/S25537/2023_OR_BE6502FF", # XXXX + "CAMBODIA/CCH230307/2023_7CC43803", # XXXX + "CAMBODIA/IAHC230253/2023_50A155E0", # XXXX + # 2024-08-06 + "VIRGINIA/H20-355-5372/2024_OR_53F1E92E", # XXXXX + "PANAMA/M273491/2024_OR_27E79A9B" # XXXXX + # 2024-0830 many changes + "COLORADO/150/2024_OR_2BD9D59A", + "COLORADO/150/2024_SIAT1_14B3FB98", + "MICHIGAN/113/2024_SIAT1_B0E6BEF3", + # 2024-0902 + "SASKATCHEWAN/RV03900/2024_OR_029CB62D", # long branch + "PANAMA/M273491/2024_OR_27E79A9B", # XXXX + # 2024-1004 + "CAMBODIA/NPH240499/2024_OR_F38419C8", # XXXX + # 2024-1025 + "CAMBODIA/NPH240575/2024_OR_33D3499A", + # 2024-1115 + "PARANA/44706/2021_OR_25BFEB40", + "SANTA_HELENA/PARANA_44706/2021_OR_25BFEB40", + # 2024-0111 (tree built 2024-1811) + "GREATER_ACCRA_REGION/FS-24-4832/2024_B7004939", + "GREATER_ACCRA_REGION/FS-24-4779/2024_1393BD1C", + # 2024-2611 + "QUEENSLAND/IN000643/2024_1ACA7E5A", + "QUEENSLAND/IN000245/2024_0854A342", + "WASHINGTON/S26082/2022_OR_7462BFDE", + "WASHINGTON/S26079/2022_OR_07B0CEEC", + "WASHINGTON/S26107/2023_OR_A56901B3", + "WASHINGTON/S26104/2022_OR_E80CAD50", + "WASHINGTON/S26097/2022_OR_40AE3B8C", + "WASHINGTON/S26077/2022_OR_C294BE90", + "WASHINGTON/S26098/2022_OR_1EDB1957", + "WASHINGTON/S26088/2022_OR_C6B3FAA9", + # 2024-0912 + "GALICIA/28220935/2023_C495745E", + "GALICIA/33341960/2023_0FF7D1C8", + # 2025-2401 + "TURKMENISTAN/784/2025_OR_F898A707", + # 2025-1202 + "COTE_DIVOIRE/4331/2024_OR_E0FF092D", + # 2025-1202 + "WISCONSIN/67/2005_XXMDCK2_8C52B46C", + # 2025-0716 (omitted exclusions in error, added 29/07 PR) + "INDIA/BHO_NIV25-827/2025_MDCK3_C54368E2", + "CAMBODIA/IAHC240402/2024_OR_35F1131A", + "INDIA/PUN_NIVCOV2572/2025_MDCK1_C9AF818D", + "ASTURIAS/232524600/2025_A4329A8F", + 
"OREL/SWINE-RII-WD2652S/2025_OR_57A8B37C", + "OREL/SWINE-RII-WD2687S/2025_OR_C384E32B", + "MEXICO/2496813201/2025_OR_E2853515", + # 2025-0723 + "NORWAY/6870/2025_OR_828769BE", + "BRITISH_COLUMBIA/RV00479/2025_SIAT2/SIAT1_0DA227AA", + "QUEBEC/240/2025_OR_8D4D59A8", + "GALICIA/52520233/2025_F71B58B8", + "BULGARIA/2506/2025_SIAT2/SIAT1_8B0BEF2D", + "BULGARIA/1941/2025_SIAT2/SIAT1_0D5E6F90", + "CAMBODIA/IBTB250007/2025_OR_4E893E81", + "ASTURIAS/232525925/2025_719B9A6C", + "BAVARIA/6511057/2025_OR_D759C9B5", + "MEXICO/2496813201/2025_OR_E2853515", + "CAMBODIA/SVH250074/2025_OR_2253D125", + "ALBANIA/310521/2025_SIAT1_5DCA4F3D", + "CAMBODIA/IAHC250174/2025_OR_6BFB38A8", + "FRANCE/ARA-HCL023004739801/2023_OR_003FC67E", + "SOUTH_AFRICA/NHLS-UCT-CERI-C067966/2025_EE268891", + "PERU/LIM-UPCH-4915/2025_1E953D52", + # 2025-0729 no exclusions + # 2025-0818 + "INDIA/BHO-NIV25-1226/2025_MDCK1_2E99D391", + "INDIA/PUD-NIV25-1183/2025_MDCK4_1BD68829", + "INDIA/JAI-NIV25-1120/2025_MDCK1_9051692B", + "ALBANIA/310569/2025_SIAT1_A284C73A", + "SWINE/OHIO/A02858604/2025_OR_3F9C4EB8", + "SWINE/NORTH_CAROLINA/A02979483/2025_OR_9F18313D", + "SWINE/IOWA/A02858606/2025_OR_8367F95A", + "SWINE/IOWA/A02858602/2025_OR_6B07CDAC", + "SWINE/MINNESOTA/A02858601/2025_OR_8A30B201", + "SWINE/MINNESOTA/A02979412/2025_OR_1838EF26", + # 2025-0827 no exclusions + # 2025-0910 no exclusions + # 2025-0917 + "INDIA/BHO-NIV25-1226/2025_MDCK1_2E99D391", + "INDIA/PUD-NIV25-1183/2025_MDCK4_1BD68829", + "ALBANIA/310569/2025_SIAT1_A284C73A", + "INDIA/AS-RMRC-DIB-1189/2023_D09F02E4", + # 2025-1006 + "MEXICO/2578825701/2025_OR_B6857194", + "CAMPO_GRANDE/LEIAL6538/2025_OR_3083EF3A", + # 2025-1028 + "INDIA/DIB-NIV25-1297/2025_DEAC04B8", + "INDIA/PUN-NIV25-1199/2025_EA34056E", + "INDIA/DEL-NIV25-1587/2025_01FD20A7", + "INDIA/PUN-NIVCOV251684/2025_95A3CE8E", + "INDIA/PUD-NIV25-1176/2025_8B8D9E17", + "INDIA/LUK-NIV25-1336/2025_4C35C9DE", + "INDIA/DIB-NIV25-1309/2025_0D480D68", + "INDIA/LUK-NIV25-1344/2025_ED98BB3E", + "INDIA/PUD-NIV25-1341/2025_663D69FB", + "GALICIA/GA-CHUAC-478/2025_OR_AE004DEB", + # 2025-1112 + "ESPIRITO_SANTO/4988/2025_OR_C4DA8B91", + # 2025-1126 + "CAMBODIA/NPH250828/2025_OR_B75D636B", + "CAMBODIA/KVH250244/2025_OR_DB084CEF", + "CAMBODIA/ISVR250088/2025_OR_8696921D", + "CAMBODIA/KPH250212/2025_OR_0DF92A52", + "CAMBODIA/INPH250362/2025_OR_6FD76539", + "CAMBODIA/AHC250325/2025_OR_10F98EBA", + "CAMBODIA/NPH250817/2025_OR_DD4119F8", + "CAMBODIA/AHC250341/2025_OR_D6BA3933", + "CAMBODIA/IKPT250105/2025_OR_6BA23E98", + "KILIFI/MAT220593/2021_OR_E96D0DC0", + "MINAS_GERAIS/CEVI_2501233/2025_OR_ACDD91A4", + ] + ) .remove_hash_duplicates() .replace_with_master() .sort("+date") @@ -317,41 +846,61 @@ def h3(subtype: str, args: argparse.Namespace): report(subtype, selected, expected_nuc_size=1650, limit=0) export(subtype, selected) + # ---------------------------------------------------------------------- sSubtypeToFunc = {"BVIC": bvic, "BYAM": byam, "A(H1N1)": h1, "A(H3N2)": h3} # ---------------------------------------------------------------------- -def export(subtype: str, selected :ae_backend.seqdb.Selected): - fasta.write(filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), selected=selected, aa=False) + +def export(subtype: str, selected: ae_backend.seqdb.Selected): + fasta.write( + filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), + selected=selected, + aa=False, + ) + # ---------------------------------------------------------------------- + def subtype_dir(subtype: str): dir = Path(sSubdirforSubtype[subtype]) 
    dir.mkdir(exist_ok=True)
    return dir

+
 # ----------------------------------------------------------------------

-def report(subtype: str, selected :ae_backend.seqdb.Selected, expected_nuc_size: int, limit: int = 20):
+
+def report(
+    subtype: str,
+    selected: ae_backend.seqdb.Selected,
+    expected_nuc_size: int,
+    limit: int = 20,
+):
     print(f"{subtype:7s} {len(selected)}")
     for seq in selected:
         if len(seq.nuc) != expected_nuc_size:
-            print(f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) ")
+            print(
+                f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) "
+            )
     for no, seq in enumerate(selected):
         if no >= limit:
             break
         # print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}] {seq.aa}")
         print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}]")

+
 # ======================================================================

 try:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("subtype", nargs="*", type=lambda src: src.upper())
-    parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False)
+    parser.add_argument(
+        "-v", "--verbose", dest="verbose", action="store_true", default=False
+    )
     args = parser.parse_args()
     exit_code = main(args) or 0
 except Exception as err:
"PERTH/16/2009" + selected = ( + ae_backend.seqdb.for_subtype(sSubtypeForSeqdb[subtype]) + .select_all() + .human() + .exclude_with_issue() + .filter_dates(first="2018-03") + .find_masters() # to be able to use .filter below + .include_name([ + outgroup, + "VICTORIA/361/2011", # 2011-10-24 3C.3 + # [2020-09-07] Nearest common ancestor of 2a and 3a in /syn/eu/ac/results/eu/2019-0417-h3-trees/cdc-usa-2009-2019-250-per-year-1.rough-29532.tree.pdf https://notebooks.antigenic-cartography.org/eu/results/eu/2019-0417-h3-trees/ + "MARYLAND/30/2012", # 2012-07-26 3C.3 + # [2020-08-17] Derek thinks that the root (A/STOCKHOLM/6/2014) is not old enough + "TEXAS/50/2012", # 2012-04-15 3C.3 root in MELB tree 2020 + "VERMONT/6/2012", + "STOCKHOLM/6/2014", # 2014-02-06 3a + # [2020-02-07] intermediate strains from nextflu https://nextstrain.org/flu/seasonal/h3n2/ha/2y to keep 3a at the top + "SWITZERLAND/9715293/2013", # 2013-12-06 3a + "NORWAY/466/2014", # 2014-02-03 3a + "SOUTH_AUSTRALIA/55/2014", # 2014-06-29 3a + "TASMANIA/11/2014", # 2014-03-16 3a + "KOBE/63/2014", # 2014-05-21 3a + "PERU/27/2015", # 2015-04-13 3a + "NEVADA/22/2016", # 2016-03-05 3a + "IDAHO/33/2016", # 2016-06-08 3a + "TEXAS/88/2016", # 2016-02-25 3a + "TEXAS/71/2017", # 2017-03-18 3a + "BRAZIL/7331/2018", # 2018-07-09 3a + "KANSAS/14/2017", # 2017-12-14 3a, to have serum circles in the sig page + "HONG_KONG/4801/2014", # 2014-02-26 2a + "HAWAII/47/2014", # 2014-07-18 2a + "NORTH_CAROLINA/4/2017", # 2017-01-26 2a2 + "NORTH_CAROLINA/4/2016", # 2016-01-14 2a1 + "ANTANANARIVO/1067/2016", # 2016-04-06 2a1 + "HONG_KONG/2286/2017", # 2017-05-23 2a1b 135K + "WISCONSIN/327/2017", # 2017-09-22 2a1b 135K + "ABU_DHABI/240/2018", # 2018-01-01 2a1b 135K + "JAMAICA/1447/2018", # 2018-02-19 2a1b 131K + + # Strains before and after T135N substitution to have a better 135N branch placement + # Sarah 2021-02-08 17:05 + "WISCONSIN/85/2016", + "SRI_LANKA/56/2017", + "SOUTH_CAROLINA/4/2017", + "YOKOHAMA/145/2017", + "INDIA/9930/2017", + "HONG_KONG/3118/2017", + "HAWAII/47/2014", + "NIIGATA-C/43/2015", + "DAKAR/17/2016", + # "CAMEROON/16V-9267/2016", # excluded, truncated sequence + "LAOS/3008/2016", + "YUNNAN_LINXIANG/1718/2016", + "HONG_KONG/2302/2016", + "ONTARIO/RV2414/2015", + "ONTARIO/RV2414/2015", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + "HONG_KONG/2286/2017", + # XXXXX 2023-0811 + "BAHIA/292038965/2021_70230607", + "BAHIA/292104308/2022_D4526D62", + # XXXXX 2023-0829 + "SOUTH_AFRICA/PET28428/2023_7572E223", + "SOUTH_AFRICA/PET28193/2023_BF46BBB4", + "NORWAY/8084/2023_OR_4548A267", + ]) + .exclude_name([ + "SWINE/", + # long branch (2021-0622) + "WISCONSIN/1/2021", + # long branch (2021-1117) + "HAWAII/28/2020", + "SOUTH_AUSTRALIA/85/2018", + "SOUTH_AUSTRALIA/1/2021", + "MANITOBA/3/2021", + "GUANGXI_GANGBEI/190/2019", + # 2021-1217 potential long branch + "OHIO/6/2021_ISOLATE", + # 2022-0204 potential long branch + "FRANCE/ARA-HCL021219268801/2021", + "FRANCE/ARA-HCL021220304101/2021", + # 2022-0209 long branch, eight X, R in nuc + "ONTARIO/RV2002/2019_P1_0F1E8F1E", + # 2022-0415 potential long branch + "ACRE/181823-IEC/2021", + # 2022-0819 potential long branch + "CATALONIA/NSVH101883267/2022_OR_F79C7813", + # 2022-0825 long branch in the tree + "INDIANA/27/2018_SIAT1_26C3A2A4", + "SWINE/KENTUCKY/18TOSU1939/2018_26C3A2A4", + "SWINE/ILLINOIS/18TOSU1411/2018_26C3A2A4", + # 2022-0826 potential long branch + "FIJI/99/2022_OR_80B95BF8", + "FIJI/102/2022_OR_6AEB79D8", + 
"WEST_VIRGINIA/25/2022_SIAT1_3E592413", + "WEST_VIRGINIA/26/2022_SIAT1_3E592413", + "WEST_VIRGINIA/24/2022_SIAT1_2FDA4DA3", + # 2022-0902 potential long branch + "FRANCE/ARA-HCL022060609601/2022_65F4520B", + # 2022-10-07 potential long branch + "CATALONIA/NSVH101908168/2022_OR_D5C0176E", + # 2022-10-28 potential long branch + "MICHIGAN/48/2022_OR_910196F9", + # 2022-11-04 potential long branch + "NEW_MEXICO/19/2022_IRX_4BA11056", + # 2022-11-11 potential long branches + "CATALONIA/NSVH534131326/2022_OR_4AC47F3F", + "A/ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_MIXED_ECE1_DCC5FAD7", + "ANAS_PLATYRHYNCHOS/BELGIUM/2499_0006/2021_ECE1_DCC5FAD7", + # 2022-11-26 potential long branches + "CATALONIA/NSVH101954411/2022_OR_7A633967", + "CATALONIA/NSVH510598655/2022_OR_57BCC58B", + # 2022-12-02 potential long branches + "MISSOURI/19/2022_OR_837C934E", + # 2022-12-10 potential long branches + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", # 159X 160X 162X 163X 164X + "ATHENS/355/2022_D9D4A86A", # 38X 40X 41X 42X 59X 67X 81X 128X + "MICHIGAN/48/2022_OR_910196F9", + "MICHIGAN/UOM10047284024/2022_OR_910196F9", + # 2023-01-08 potential long branches + "BRITISH_COLUMBIA/PHL-480/2022_OR_EAAE7C01", + # 2023-01-13 potential long branch + "GALICIA/22034482/2022_2EE89148", + "GALICIA/22018524/2022_1EE46082", + "GALICIA/22033804/2022_0EE699D6", + "GALICIA/21987208/2022_F49E5748", + "GALICIA/22036079/2022_430E996A", + "GALICIA/22018743/2022_0116B017", + "GALICIA/22011636/2022_3D9F40AD", + # 2023-01-20 potential long branch + "SINGAPORE/GP9117/2022_OR_69761A51", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + # 2023-01-28 potential long branches + "MICHIGAN/48/2022_OR_910196F9", + "MACEDONIA/IPH-MKD-59729/2021_OR_837D0C9D", + "KHON_KAEN/FS005/2022_OR_1A5637D8", + "TURKEY/GIHSN_ANKARA-14020_7/2022_012CD2F7", + "ATHENS/355/2022_D9D4A86", + "CATALONIA/NSVH101994299/2023_OR_13A43DA9", + "MACEDONIA/IPH-MKD-HD/2021_OR_B52D1807", + "SOUTH_AFRICA/R07603/2022_OR_7937FE46", + "SOUTH_AFRICA/R05772/2022_OR_552B5F5E", + "GALICIA/22034482/2022_2EE89148", + "CATALONIA/NSVH560028269/2022_OR_9C189CFA", + # 2023-02-04 potential long branches + "MANITOBA/RV00728/2022_P1_5236154F", + "ONTARIO/198/2022_OR_D1EC3B40", + # 2023-04-21 potential long branches + "ROMANIA/493/2023_612606B9", + "PAKISTAN/1140/2023_OR_3D2D111B", + "CATALONIA/NSVH102045605/2023_OR_7490FFA8", + # XXX & long branch 2023-0731 + "BAHIA/PVM105707/2021_OR_BCE07E46", + "SOUTH_AFRICA/K056301/2023_30E0E639", + # 2023-09-08 potential long branch ?why + "SOUTH_AFRICA/K056301/2023_CC1FE952" + ]) + .remove_hash_duplicates() + .replace_with_master() + .sort("+date") + .move_name_to_beginning(outgroup) + ) + report(subtype, selected, expected_nuc_size=1650, limit=0) + export(subtype, selected) + +# ---------------------------------------------------------------------- + +sSubtypeToFunc = {"BVIC": bvic, "BYAM": byam, "A(H1N1)": h1, "A(H3N2)": h3} + +# ---------------------------------------------------------------------- + +def export(subtype: str, selected :ae_backend.seqdb.Selected): + fasta.write(filename_or_stream=subtype_dir(subtype).joinpath("source.fas"), selected=selected, aa=False) + +# ---------------------------------------------------------------------- + +def subtype_dir(subtype: str): + dir = Path(sSubdirforSubtype[subtype]) + dir.mkdir(exist_ok=True) + return dir + +# ---------------------------------------------------------------------- + +def report(subtype: str, selected :ae_backend.seqdb.Selected, expected_nuc_size: int, limit: int = 20): + 
print(f"{subtype:7s} {len(selected)}") + for seq in selected: + if len(seq.nuc) != expected_nuc_size: + print(f">> wrong size {len(seq.nuc)} of {seq.seq_id()} (expected: {expected_nuc_size}) ") + for no, seq in enumerate(selected): + if no >= limit: + break + # print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}] {seq.aa}") + print(f"{no:4d} {seq.seq_id():40s} [{seq.date()}]") + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subtype", nargs="*", type=lambda src: src.upper()) + parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", default=False) + args = parser.parse_args() + exit_code = main(args) or 0 +except Exception as err: + print(f"> {err}\n{traceback.format_exc()}", file=sys.stderr) + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/garli-h3 b/proj/weekly-tree/garli-h3 new file mode 100755 index 0000000..49c0e63 --- /dev/null +++ b/proj/weekly-tree/garli-h3 @@ -0,0 +1,452 @@ +#! /usr/bin/env python3 + +import sys, os, re, argparse, logging, time, datetime, concurrent.futures, subprocess, random, traceback +from pathlib import Path + +# ====================================================================== + +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +sRandomGen = random.SystemRandom() + +# ---------------------------------------------------------------------- + +def main(args: argparse.Namespace): + argv = '"' + '" "'.join(sys.argv) + '"' + working_dir = Path(args.subdir).resolve() + if args.start_tree: + tree = args.start_tree.resolve() + else: + tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve() + if args.source_fasta: + source_fasta = args.source_fasta.resolve() + else: + source_fasta = working_dir.joinpath("source.fas").resolve() + try: + best_tree_file = garli(working_dir=working_dir, tree=tree, source_fas=source_fasta, num_runs=args.num_runs, stoptime=args.stoptime, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes, new_output=args.new_output) + subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True) + return 0 + except Exception as err: + subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True) + raise + +# ---------------------------------------------------------------------- + +def garli(working_dir: Path, tree: Path, source_fas: Path, num_runs: int, stoptime: int, slurm_nodelist: str, slurm_exclude_nodes: str, new_output: bool): + + default_required_memory = 15000 + + # the number of attachment branches evaluated for each taxon to be added to the tree during the creation of an ML stepwise-addition starting tree (when garli is run without raxml step) + attachmentspertaxon = 1000000 + # termination condition: when no new significantly better scoring topology has been encountered in greater than this number of generations, garli stops + genthreshfortopoterm = 20000 + + output_dir = 
get_output_dir(working_dir, "garli.", new_output=new_output) + + garli_cmd = "/syn/bin/garli" + srun_cmd = [ + "srun", + "--cpus-per-task=2", + # "--nodes=1", + "--ntasks=1", + "--threads=1", + f"--nodelist={slurm_nodelist}", + ] + if slurm_exclude_nodes: + srun_cmd.append(f"--exclude={slurm_exclude_nodes}") + + # ---------------------------------------------------------------------- + + def make_conf(run_id, required_memory): + """Returns filename of the written conf file""" + # if not outgroup or not isinstance(outgroup, list) or not all(isinstance(e, int) and e > 0 for e in outgroup): + # raise ValueError("outgroup must be non-empty list of taxa indices in the fasta file starting with 1") + garli_args = { + "source": str(source_fas.resolve()), + "availablememory": required_memory or default_required_memory, + "streefname": str(tree.resolve()), + "output_prefix": str(output_dir.joinpath(run_id)), + "attachmentspertaxon": attachmentspertaxon, + "randseed": sRandomGen.randint(1, 0xFFFFFFF), + "genthreshfortopoterm": genthreshfortopoterm, + "searchreps": 1, + "stoptime": stoptime, + "outgroup": "1" # " ".join(str(e) for e in outgroup), + } + conf = GARLI_CONF.format(**garli_args) + conf_filename = output_dir.joinpath(run_id + ".garli.conf") + with conf_filename.open("w") as f: + f.write(conf) + return conf_filename + + # ---------------------------------------------------------------------- + + def find_memory_requirements(): + return default_required_memory + +# sGreat = re.compile(r"\s*great\s+>=\s+(\d+)\s+MB", re.I) +# conf_file = make_conf(run_id="find_memory_requirements", required_memory=None) +# start = time.time() +# proc = subprocess.Popen([garli_cmd, str(conf_file)], stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +# required_memory = None # default, in case the code below fails or times out +# timeout = 300 +# while required_memory is None: +# line = proc.stdout.readline().decode("utf-8") +# if mm := sGreat.match(line): +# required_memory = int(mm.group(1)) +# elif (time.time() - start) > timeout: +# required_memory = default_required_memory +# print(f"WARNING: Cannot obtain memory required by garli in {timeout} seconds: default for {working_dir.name}: {required_memory} will be used", file=sys.stderr) +# proc.kill() +# return required_memory + + # ---------------------------------------------------------------------- + + def run_garli(run_id, required_memory): + conf_file = make_conf(run_id=run_id, required_memory=required_memory) + # --mem-per-cpu is half of required_memory because we allocate whole 2 cpus per run to prevent from hyperthreading + cmd = (srun_cmd + [f"--mem-per-cpu={int(required_memory / 2) + 1}", f"--job-name='garli {working_dir.name} {run_id}'", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"] + + [garli_cmd, str(conf_file)]) + with output_dir.joinpath("commands.txt").open("a") as commands_txt: + print(" ".join(cmd), file=commands_txt) + # print(cmd) + subprocess.check_call(cmd) + + # ---------------------------------------------------------------------- + + def submit(): + required_memory = find_memory_requirements() + failures = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + future_to_runid = {executor.submit(run_garli, run_id, required_memory): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))} + for future in concurrent.futures.as_completed(future_to_runid): + try: + future.result() + except Exception as err: + 
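# (comment added) a failed replicate is logged rather than aborting the batch: results_available() treats the garli step as complete once more than half of the runs produced a *.best.phy +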
logging.error(f"garli {working_dir.name} {future_to_runid[future]} FAILED: {err}") + failures += 1 + else: + logging.info(f"garli run {working_dir.name} {future_to_runid[future]} completed") + if failures: + logging.error(f"garli {working_dir.name} FAILED: {failures} runs failed") + else: + logging.info("garli completed") + + # ---------------------------------------------------------------------- + + def results_available(): + return len(list(output_dir.glob("*.best.phy"))) > (num_runs / 2) + + # ---------------------------------------------------------------------- + + sReFinalScore = re.compile(r"Final\t-(\d+\.\d+)") + + def find_best_result(): + # scores in results are positive numbers + results = [[log_file_name, float(sReFinalScore.search(log_file_name.open().read()).group(1))] for log_file_name in output_dir.glob("*.log00.log")] + if not results: + raise RuntimeError(f"no results found in {output_dir}/*.log00.log") + results.sort(key=lambda en: en[1]) + output_dir.parent.joinpath("garli-best.txt").open("w").write(str(results[0][0]).replace(".log00.log", ".best.phy")) + output_dir.parent.joinpath("garli-all.txt").open("w").write("\n".join(f"{score:13.6f} {str(log_name).replace('.log00.log', '.best.phy')}" for log_name, score in results)) + return [Path(str(results[0][0]).replace(".log00.log", ".best.phy")), results[0][1]] + + # ---------------------------------------------------------------------- + + if not results_available(): + start = datetime.datetime.now() + submit() + logging.info(f"[{working_dir}]: garli elapsed: {datetime.datetime.now() - start}") + best_tree_file, best_score = find_best_result() + logging.info(f"[{working_dir}]: garli best: {best_tree_file} score: {best_score}") + return best_tree_file + +# ---------------------------------------------------------------------- + +GARLI_CONF = """\ +[general] +datafname = {source} +streefname = {streefname} +# Prefix of all output filenames, such as log, treelog, etc. Change +# this for each run that you do or the program will overwrite previous +# results. +ofprefix = {output_prefix} + +constraintfile = none + +# (1 to infinity) The number of attachment branches evaluated for each +# taxon to be added to the tree during the creation of an ML +# stepwise-addition starting tree. Briefly, stepwise addition is an +# algorithm used to make a tree, and involves adding taxa in a random +# order to a growing tree. For each taxon to be added, a number of +# randomly chosen attachment branches are tried and scored, and then +# the best scoring one is chosen as the location of that taxon. The +# attachmentspertaxon setting controls how many attachment points are +# evaluated for each taxon to be added. A value of one is equivalent +# to a completely random tree (only one randomly chosen location is +# evaluated). A value of greater than 2 times the number of taxa in +# the dataset means that all attachment points will be evaluated for +# each taxon, and will result in very good starting trees (but may +# take a while on large datasets). Even fairly small values (< 10) can +# result in starting trees that are much, much better than random, but +# still fairly different from one another. Default: 50. 
+attachmentspertaxon = {attachmentspertaxon} + +# -1 - random seed chosen automatically +randseed = {randseed} + +# in Mb +availablememory = {availablememory} + +# The frequency with which the best score is written to the log file, default: 10 +logevery = 1000 + +# Whether to write three files to disk containing all information +# about the current state of the population every saveevery +# generations, with each successive checkpoint overwriting the +# previous one. These files can be used to restart a run at the last +# written checkpoint by setting the restart configuration entry. Default: 0 +writecheckpoints = 0 + +# Whether to restart at a previously saved checkpoint. To use this +# option the writecheckpoints option must have been used during a +# previous run. The program will look for checkpoint files that are +# named based on the ofprefix of the previous run. If you intend to +# restart a run, NOTHING should be changed in the config file except +# setting restart to 1. A run that is restarted from checkpoint will +# give exactly the same results it would have if the run had gone to +# completion. Default: 0 +restart = 0 + +# If writecheckpoints or outputcurrentbesttopology are specified, this +# is the frequency (in generations) at which checkpoints or the +# current best tree are written to file. default: 100 +saveevery = 1000 + +# Specifies whether some initial rough optimization is performed on +# the starting branch lengths and alpha parameter. This is always +# recommended. Default: 1. +refinestart = 1 + +# If true, the current best tree of the current search replicate is +# written to .best.current.tre every saveevery +# generations. In versions before 0.96 the current best topology was +# always written to file, but that is no longer the case. Seeing the +# current best tree has no real use apart from satisfying your +# curiosity about how a run is going. Default: 0. +outputcurrentbesttopology = 0 + +# If true, each new topology encountered with a better score than the +# previous best is written to file. In some cases this can result in +# really big files (hundreds of MB) though, especially for random +# starting topologies on large datasets. Note that this file is +# interesting to get an idea of how the topology changed as the +# searches progressed, but the collection of trees should NOT be +# interpreted in any meaningful way. This option is not available +# while bootstrapping. Default: 0. +outputeachbettertopology = 0 + +# Specifies whether the automatic termination conditions will be +# used. The conditions specified by both of the following two +# parameters must be met. See the following two parameters for their +# definitions. If this is false, the run will continue until it +# reaches the time (stoptime) or generation (stopgen) limit. It is +# highly recommended that this option be used! Default: 1. +enforcetermconditions = 1 + +# This specifies the first part of the termination condition. When no +# new significantly better scoring topology (see significanttopochange +# below) has been encountered in greater than this number of +# generations, this condition is met. Increasing this parameter may +# improve the lnL scores obtained (especially on large datasets), but +# will also increase runtimes. Default: 20000. +genthreshfortopoterm = {genthreshfortopoterm} + +# The second part of the termination condition. 
When the total +# improvement in score over the last intervallength x intervalstostore +# generations (default is 500 generations, see below) is less than +# this value, this condition is met. This does not usually need to be +# changed. Default: 0.05 +scorethreshforterm = 0.05 + +# The lnL increase required for a new topology to be considered +# significant as far as the termination condition is concerned. It +# probably doesn’t need to be played with, but you might try +# increasing it slightly if your runs reach a stable score and then +# take a very long time to terminate due to very minor changes in +# topology. Default: 0.01 +significanttopochange = 0.01 + +# Whether phylip formatted tree files will be output in addition to +# the default nexus files for the best tree across all replicates +# (.best.phy), the best tree for each replicate +# (.best.all.phy) or in the case of bootstrapping, the best +# tree for each bootstrap replicate (). +# We use .phy (it's newick tree format), use 1 here! +outputphyliptree = 1 + +# Whether to output three files of little general interest: the +# “fate”, “problog” and “swaplog” files. The fate file shows the +# parentage, mutation types and scores of every individual in the +# population during the entire search. The problog shows how the +# proportions of the different mutation types changed over the course +# of the run. The swaplog shows the number of unique swaps and the +# number of total swaps on the current best tree over the course of +# the run. Default: 0 +outputmostlyuselessfiles = 0 + +# This option allows for orienting the tree topologies in a consistent +# way when they are written to file. Note that this has NO effect +# whatsoever on the actual inference and the specified outgroup is NOT +# constrained to be present in the inferred trees. If multiple +# outgroup taxa are specified and they do not form a monophyletic +# group, this setting will be ignored. If you specify a single +# outgroup taxon it will always be present, and the tree will always +# be consistently oriented. To specify an outgroup consisting of taxa +# 1, 3 and 5 the format is this: outgroup = 1 3 5 +outgroup = {outgroup} + +resampleproportion = 1.0 +inferinternalstateprobs = 0 +outputsitelikelihoods = 0 +optimizeinputonly = 0 +collapsebranches = 1 + +# The number of independent search replicates to perform during a +# program execution. You should always either do multiple search +# replicates or multiple program executions with any dataset to get a +# feel for whether you are getting consistent results, which suggests +# that the program is doing a decent job of searching. Note that if +# this is > 1 and you are performing a bootstrap analysis, this is the +# number of search replicates to be done per bootstrap replicate. That +# can increase the chance of finding the best tree per bootstrap +# replicate, but will also increase bootstrap runtimes +# enormously. Default: 2 +searchreps = {searchreps} + +bootstrapreps = 0 + +# ------------------- FOR NUCLEOTIDES -------------- +datatype=nucleotide + +# The number of relative substitution rate parameters (note that the +# number of free parameters is this value minus one). Equivalent to +# the “nst” setting in PAUP* and MrBayes. 1rate assumes that +# substitutions between all pairs of nucleotides occur at the same +# rate, 2rate allows different rates for transitions and +# transversions, and 6rate allows a different rate between each +# nucleotide pair. These rates are estimated unless the fixed option +# is chosen.
New in version 0.96, parameters for any submodel of the +# GTR model may now be estimated. The format for specifying this is +# very similar to that used in the “rclass” setting of PAUP*. Within +# parentheses, six letters are specified, with spaces between +# them. The six letters represent the rates of substitution between +# the six pairs of nucleotides, with the order being A-C, A-G, A-T, +# C-G, C-T and G-T. Letters within the parentheses that are the same +# mean that a single parameter is shared by multiple nucleotide +# pairs. For example, ratematrix = (a b a a b a) would specify the HKY +# 2-rate model (equivalent to ratematrix = 2rate). This entry, +# ratematrix = (a b c c b a) would specify 3 estimated rates of +# substitution, with one rate shared by A-C and G-T substitutions, +# another rate shared by A-G and C-T substitutions, and the final rate +# shared by A-T and C-G substitutions. Default: 6rate +ratematrix = 6rate + +# (equal, empirical, estimate, fixed) Specifies how the equilibrium +# state frequencies (A, C, G and T) are treated. The empirical setting +# fixes the frequencies at their observed proportions, and the other +# options should be self-explanatory. Default: estimate +statefrequencies = estimate + +# (none, gamma, gammafixed) – The model of rate heterogeneity +# assumed. “gammafixed” requires that the alpha shape parameter is +# provided, and a setting of “gamma” estimates it. Default: gamma +ratehetmodel = gamma + +# (1 to 20) – The number of categories of variable rates (not +# including the invariant site class if it is being used). Must be set +# to 1 if ratehetmodel is set to none. Note that runtimes and memory +# usage scale linearly with this setting. Default: 4 +numratecats = 4 + +# (none, estimate, fixed) Specifies whether a parameter representing +# the proportion of sites that are unable to change (i.e. have a +# substitution rate of zero) will be included. This is typically +# referred to as “invariant sites”, but would better be termed +# “invariable sites”. Default: estimate +invariantsites = estimate + +# ---------------------------------------------------------------------- + +[master] +nindivs = 4 +holdover = 1 +selectionintensity = .5 +holdoverpenalty = 0 + +# The maximum number of generations to run. Note that this supersedes +# the automated stopping criterion (see enforcetermconditions above), +# and should therefore be set to a very large value if automatic +# termination is desired. +stopgen = 1000000 + +# The maximum number of seconds for the run to continue. Note that +# this supersedes the automated stopping criterion (see +# enforcetermconditions above), and should therefore be set to a very +# large value if automatic termination is desired.
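+# [note added: stoptime is filled in from the wrapper script's --stoptime option, which defaults to 3600 seconds; see the argparse setup at the bottom of this file]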
+stoptime = {stoptime} + +startoptprec = 0.5 +minoptprec = 0.01 +numberofprecreductions = 20 +treerejectionthreshold = 50.0 +topoweight = 1.0 +modweight = 0.05 +brlenweight = 0.2 +randnniweight = 0.1 +randsprweight = 0.3 +limsprweight = 0.6 +intervallength = 100 +intervalstostore = 5 + +limsprrange = 6 +meanbrlenmuts = 5 +gammashapebrlen = 1000 +gammashapemodel = 1000 +uniqueswapbias = 0.1 +distanceswapbias = 1.0 +""" + +# ---------------------------------------------------------------------- + +def get_output_dir(working_dir: Path, prefix: str, new_output: bool): + if not new_output and (output_dirs := list(working_dir.glob(prefix + "*"))): + output_dir = max(output_dirs) + else: + output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S')) + output_dir.mkdir() + output_dir.chmod(0o777) + return output_dir + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subdir") + parser.add_argument("-n", "--num-runs", type=int, default=16) + parser.add_argument("-s", "--stoptime", default=60*60, type=int, help="termination condition: the maximum number of seconds for the run to continue") + parser.add_argument("-t", "--start-tree", default=None, type=Path) + parser.add_argument("-f", "--source-fasta", default=None, type=Path) + parser.add_argument("--new-output", action="store_true", default=False) + parser.add_argument("--node", dest="slurm_nodelist", default="i22", help="one node to run on, to run on multiple nodes use --exclude-nodes") + parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s") + exit_code = main(args) or 0 +except Exception as err: + logging.error(f"> {err}\n{traceback.format_exc()}") + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/garli-memvar b/proj/weekly-tree/garli-memvar new file mode 100755 index 0000000..35dd4ee --- /dev/null +++ b/proj/weekly-tree/garli-memvar @@ -0,0 +1,452 @@ +#! 
/usr/bin/env python3 + +import sys, os, re, argparse, logging, time, datetime, concurrent.futures, subprocess, random, traceback +from pathlib import Path + +# ====================================================================== + +sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"} +sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"} + +sRandomGen = random.SystemRandom() + +# ---------------------------------------------------------------------- + +def main(args: argparse.Namespace): + argv = '"' + '" "'.join(sys.argv) + '"' + working_dir = Path(args.subdir).resolve() + if args.start_tree: + tree = args.start_tree.resolve() + else: + tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve() + if args.source_fasta: + source_fasta = args.source_fasta.resolve() + else: + source_fasta = working_dir.joinpath("source.fas").resolve() + try: + best_tree_file = garli(working_dir=working_dir, tree=tree, source_fas=source_fasta, num_runs=args.num_runs, stoptime=args.stoptime, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes, new_output=args.new_output) + subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True) + return 0 + except Exception as err: + subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", "weekly-tree@antigenic-cartography.org"], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True) + raise + +# ---------------------------------------------------------------------- + +def garli(working_dir: Path, tree: Path, source_fas: Path, num_runs: int, stoptime: int, slurm_nodelist: str, slurm_exclude_nodes: str, new_output: bool): + + default_required_memory = 15800 + + # the number of attachment branches evaluated for each taxon to be added to the tree during the creation of an ML stepwise-addition starting tree (when garli is run without raxml step) + attachmentspertaxon = 1000000 + # termination condition: when no new significantly better scoring topology has been encountered in greater than this number of generations, garli stops + genthreshfortopoterm = 20000 + + output_dir = get_output_dir(working_dir, "garli.", new_output=new_output) + + garli_cmd = "/syn/bin/garli" + srun_cmd = [ + "srun", + "--cpus-per-task=2", + # "--nodes=1", + "--ntasks=1", + "--threads=1", + f"--nodelist={slurm_nodelist}", + ] + if slurm_exclude_nodes: + srun_cmd.append(f"--exclude={slurm_exclude_nodes}") + + # ---------------------------------------------------------------------- + + def make_conf(run_id, required_memory): + """Returns filename of the written conf file""" + # if not outgroup or not isinstance(outgroup, list) or not all(isinstance(e, int) and e > 0 for e in outgroup): + # raise ValueError("outgroup must be non-empty list of taxa indices in the fasta file starting with 1") + garli_args = { + "source": str(source_fas.resolve()), + "availablememory": required_memory or default_required_memory, + "streefname": str(tree.resolve()), + "output_prefix": str(output_dir.joinpath(run_id)), + "attachmentspertaxon": attachmentspertaxon, + "randseed": sRandomGen.randint(1, 0xFFFFFFF), + "genthreshfortopoterm": genthreshfortopoterm, + "searchreps": 1, + "stoptime": stoptime, + "outgroup": "1" # " ".join(str(e) for e 
in outgroup), + } + conf = GARLI_CONF.format(**garli_args) + conf_filename = output_dir.joinpath(run_id + ".garli.conf") + with conf_filename.open("w") as f: + f.write(conf) + return conf_filename + + # ---------------------------------------------------------------------- + + def find_memory_requirements(): + return default_required_memory + +# sGreat = re.compile(r"\s*great\s+>=\s+(\d+)\s+MB", re.I) +# conf_file = make_conf(run_id="find_memory_requirements", required_memory=None) +# start = time.time() +# proc = subprocess.Popen([garli_cmd, str(conf_file)], stdin=subprocess.DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +# required_memory = None # default, in case the code below fails or times out +# timeout = 300 +# while required_memory is None: +# line = proc.stdout.readline().decode("utf-8") +# if mm := sGreat.match(line): +# required_memory = int(mm.group(1)) +# elif (time.time() - start) > timeout: +# required_memory = default_required_memory +# print(f"WARNING: Cannot obtain memory required by garli in {timeout} seconds: default for {working_dir.name}: {required_memory} will be used", file=sys.stderr) +# proc.kill() +# return required_memory + + # ---------------------------------------------------------------------- + + def run_garli(run_id, required_memory): + conf_file = make_conf(run_id=run_id, required_memory=required_memory) + # --mem-per-cpu is half of required_memory because we allocate whole 2 cpus per run to prevent from hyperthreading + cmd = (srun_cmd + [f"--mem-per-cpu={int(required_memory / 2) + 1}", f"--job-name='garli {working_dir.name} {run_id}'", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"] + + [garli_cmd, str(conf_file)]) + with output_dir.joinpath("commands.txt").open("a") as commands_txt: + print(" ".join(cmd), file=commands_txt) + # print(cmd) + subprocess.check_call(cmd) + + # ---------------------------------------------------------------------- + + def submit(): + required_memory = find_memory_requirements() + failures = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + future_to_runid = {executor.submit(run_garli, run_id, required_memory): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))} + for future in concurrent.futures.as_completed(future_to_runid): + try: + future.result() + except Exception as err: + logging.error(f"garli {working_dir.name} {future_to_runid[future]} FAILED: {err}") + failures += 1 + else: + logging.info(f"garli run {working_dir.name} {future_to_runid[future]} completed") + if failures: + logging.error(f"garli {working_dir.name} FAILED: {failures} runs failed") + else: + logging.info("garli completed") + + # ---------------------------------------------------------------------- + + def results_available(): + return len(list(output_dir.glob("*.best.phy"))) > (num_runs / 2) + + # ---------------------------------------------------------------------- + + sReFinalScore = re.compile(r"Final\t-(\d+\.\d+)") + + def find_best_result(): + # scores in results are positive numbers + results = [[log_file_name, float(sReFinalScore.search(log_file_name.open().read()).group(1))] for log_file_name in output_dir.glob("*.log00.log")] + if not results: + raise RuntimeError(f"no results found in {output_dir}/*.log00.log") + results.sort(key=lambda en: en[1]) + output_dir.parent.joinpath("garli-best.txt").open("w").write(str(results[0][0]).replace(".log00.log", ".best.phy")) + 
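# (comment added) a smaller magnitude means a less negative lnL, so after the ascending sort results[0] is the best run; its .best.phy path goes to garli-best.txt above, and the full ranked list is written to garli-all.txt below +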
output_dir.parent.joinpath("garli-all.txt").open("w").write("\n".join(f"{score:13.6f} {str(log_name).replace('.log00.log', '.best.phy')}" for log_name, score in results)) + return [Path(str(results[0][0]).replace(".log00.log", ".best.phy")), results[0][1]] + + # ---------------------------------------------------------------------- + + if not results_available(): + start = datetime.datetime.now() + submit() + logging.info(f"[{working_dir}]: garli elapsed: {datetime.datetime.now() - start}") + best_tree_file, best_score = find_best_result() + logging.info(f"[{working_dir}]: garli best: {best_tree_file} score: {best_score}") + return best_tree_file + +# ---------------------------------------------------------------------- + +GARLI_CONF = """\ +[general] +datafname = {source} +streefname = {streefname} +# Prefix of all output filenames, such as log, treelog, etc. Change +# this for each run that you do or the program will overwrite previous +# results. +ofprefix = {output_prefix} + +constraintfile = none + +# (1 to infinity) The number of attachment branches evaluated for each +# taxon to be added to the tree during the creation of an ML +# stepwise-addition starting tree. Briefly, stepwise addition is an +# algorithm used to make a tree, and involves adding taxa in a random +# order to a growing tree. For each taxon to be added, a number of +# randomly chosen attachment branches are tried and scored, and then +# the best scoring one is chosen as the location of that taxon. The +# attachmentspertaxon setting controls how many attachment points are +# evaluated for each taxon to be added. A value of one is equivalent +# to a completely random tree (only one randomly chosen location is +# evaluated). A value of greater than 2 times the number of taxa in +# the dataset means that all attachment points will be evaluated for +# each taxon, and will result in very good starting trees (but may +# take a while on large datasets). Even fairly small values (< 10) can +# result in starting trees that are much, much better than random, but +# still fairly different from one another. Default: 50. +attachmentspertaxon = {attachmentspertaxon} + +# -1 - random seed chosen automatically +randseed = {randseed} + +# in Mb +availablememory = {availablememory} + +# The frequency with which the best score is written to the log file, default: 10 +logevery = 1000 + +# Whether to write three files to disk containing all information +# about the current state of the population every saveevery +# generations, with each successive checkpoint overwriting the +# previous one. These files can be used to restart a run at the last +# written checkpoint by setting the restart configuration entry. Default: 0 +writecheckpoints = 0 + +# Whether to restart at a previously saved checkpoint. To use this +# option the writecheckpoints option must have been used during a +# previous run. The program will look for checkpoint files that are +# named based on the ofprefix of the previous run. If you intend to +# restart a run, NOTHING should be changed in the config file except +# setting restart to 1. A run that is restarted from checkpoint will +# give exactly the same results it would have if the run had gone to +# completion. Default: 0 +restart = 0 + +# If writecheckpoints or outputcurrentbesttopology are specified, this +# is the frequency (in generations) at which checkpoints or the +# current best tree are written to file. 
default: 100 +saveevery = 1000 + +# Specifies whether some initial rough optimization is performed on +# the starting branch lengths and alpha parameter. This is always +# recommended. Default: 1. +refinestart = 1 + +# If true, the current best tree of the current search replicate is +# written to .best.current.tre every saveevery +# generations. In versions before 0.96 the current best topology was +# always written to file, but that is no longer the case. Seeing the +# current best tree has no real use apart from satisfying your +# curiosity about how a run is going. Default: 0. +outputcurrentbesttopology = 0 + +# If true, each new topology encountered with a better score than the +# previous best is written to file. In some cases this can result in +# really big files (hundreds of MB) though, especially for random +# starting topologies on large datasets. Note that this file is +# interesting to get an idea of how the topology changed as the +# searches progressed, but the collection of trees should NOT be +# interpreted in any meaningful way. This option is not available +# while bootstrapping. Default: 0. +outputeachbettertopology = 0 + +# Specifies whether the automatic termination conditions will be +# used. The conditions specified by both of the following two +# parameters must be met. See the following two parameters for their +# definitions. If this is false, the run will continue until it +# reaches the time (stoptime) or generation (stopgen) limit. It is +# highly recommended that this option be used! Default: 1. +enforcetermconditions = 1 + +# This specifies the first part of the termination condition. When no +# new significantly better scoring topology (see significanttopochange +# below) has been encountered in greater than this number of +# generations, this condition is met. Increasing this parameter may +# improve the lnL scores obtained (especially on large datasets), but +# will also increase runtimes. Default: 20000. +genthreshfortopoterm = {genthreshfortopoterm} + +# The second part of the termination condition. When the total +# improvement in score over the last intervallength x intervalstostore +# generations (default is 500 generations, see below) is less than +# this value, this condition is met. This does not usually need to be +# changed. Default: 0.05 +scorethreshforterm = 0.05 + +# The lnL increase required for a new topology to be considered +# significant as far as the termination condition is concerned. It +# probably doesn’t need to be played with, but you might try +# increasing it slightly if your runs reach a stable score and then +# take a very long time to terminate due to very minor changes in +# topology. Default: 0.01 +significanttopochange = 0.01 + +# Whether phylip formatted tree files will be output in addition to +# the default nexus files for the best tree across all replicates +# (.best.phy), the best tree for each replicate +# (.best.all.phy) or in the case of bootstrapping, the best +# tree for each bootstrap replicate (). +# We use .phy (it's newick tree format), use 1 here! +outputphyliptree = 1 + +# Whether to output three files of little general interest: the +# “fate”, “problog” and “swaplog” files. The fate file shows the +# parentage, mutation types and scores of every individual in the +# population during the entire search. The problog shows how the +# proportions of the different mutation types changed over the course +# of the run.
The swaplog shows the number of unique swaps and the +# number of total swaps on the current best tree over the course of +# the run. Default: 0 +outputmostlyuselessfiles = 0 + +# This option allows for orienting the tree topologies in a consistent +# way when they are written to file. Note that this has NO effect +# whatsoever on the actual inference and the specified outgroup is NOT +# constrained to be present in the inferred trees. If multiple +# outgroup taxa are specified and they do not form a monophyletic +# group, this setting will be ignored. If you specify a single +# outgroup taxon it will always be present, and the tree will always +# be consistently oriented. To specify an outgroup consisting of taxa +# 1, 3 and 5 the format is this: outgroup = 1 3 5 +outgroup = {outgroup} + +resampleproportion = 1.0 +inferinternalstateprobs = 0 +outputsitelikelihoods = 0 +optimizeinputonly = 0 +collapsebranches = 1 + +# The number of independent search replicates to perform during a +# program execution. You should always either do multiple search +# replicates or multiple program executions with any dataset to get a +# feel for whether you are getting consistent results, which suggests +# that the program is doing a decent job of searching. Note that if +# this is > 1 and you are performing a bootstrap analysis, this is the +# number of search replicates to be done per bootstrap replicate. That +# can increase the chance of finding the best tree per bootstrap +# replicate, but will also increase bootstrap runtimes +# enormously. Default: 2 +searchreps = {searchreps} + +bootstrapreps = 0 + +# ------------------- FOR NUCLEOTIDES -------------- +datatype=nucleotide + +# The number of relative substitution rate parameters (note that the +# number of free parameters is this value minus one). Equivalent to +# the “nst” setting in PAUP* and MrBayes. 1rate assumes that +# substitutions between all pairs of nucleotides occur at the same +# rate, 2rate allows different rates for transitions and +# transversions, and 6rate allows a different rate between each +# nucleotide pair. These rates are estimated unless the fixed option +# is chosen. New in version 0.96, parameters for any submodel of the +# GTR model may now be estimated. The format for specifying this is +# very similar to that used in the “rclass” setting of PAUP*. Within +# parentheses, six letters are specified, with spaces between +# them. The six letters represent the rates of substitution between +# the six pairs of nucleotides, with the order being A-C, A-G, A-T, +# C-G, C-T and G-T. Letters within the parentheses that are the same +# mean that a single parameter is shared by multiple nucleotide +# pairs. For example, ratematrix = (a b a a b a) would specify the HKY +# 2-rate model (equivalent to ratematrix = 2rate). This entry, +# ratematrix = (a b c c b a) would specify 3 estimated rates of +# substitution, with one rate shared by A-C and G-T substitutions, +# another rate shared by A-G and C-T substitutions, and the final rate +# shared by A-T and C-G substitutions. Default: 6rate +ratematrix = 6rate + +# (equal, empirical, estimate, fixed) Specifies how the equilibrium +# state frequencies (A, C, G and T) are treated. The empirical setting +# fixes the frequencies at their observed proportions, and the other +# options should be self-explanatory. Default: estimate +statefrequencies = estimate + +# (none, gamma, gammafixed) – The model of rate heterogeneity +# assumed.
“gammafixed” requires that the alpha shape parameter is +# provided, and a setting of “gamma” estimates it. Default: gamma +ratehetmodel = gamma + +# (1 to 20) – The number of categories of variable rates (not +# including the invariant site class if it is being used). Must be set +# to 1 if ratehetmodel is set to none. Note that runtimes and memory +# usage scale linearly with this setting. Default: 4 +numratecats = 4 + +# (none, estimate, fixed) Specifies whether a parameter representing +# the proportion of sites that are unable to change (i.e. have a +# substitution rate of zero) will be included. This is typically +# referred to as “invariant sites”, but would better be termed +# “invariable sites”. Default: estimate +invariantsites = estimate + +# ---------------------------------------------------------------------- + +[master] +nindivs = 4 +holdover = 1 +selectionintensity = .5 +holdoverpenalty = 0 + +# The maximum number of generations to run. Note that this supersedes +# the automated stopping criterion (see enforcetermconditions above), +# and should therefore be set to a very large value if automatic +# termination is desired. +stopgen = 1000000 + +# The maximum number of seconds for the run to continue. Note that +# this supersedes the automated stopping criterion (see +# enforcetermconditions above), and should therefore be set to a very +# large value if automatic termination is desired. +stoptime = {stoptime} + +startoptprec = 0.5 +minoptprec = 0.01 +numberofprecreductions = 20 +treerejectionthreshold = 50.0 +topoweight = 1.0 +modweight = 0.05 +brlenweight = 0.2 +randnniweight = 0.1 +randsprweight = 0.3 +limsprweight = 0.6 +intervallength = 100 +intervalstostore = 5 + +limsprrange = 6 +meanbrlenmuts = 5 +gammashapebrlen = 1000 +gammashapemodel = 1000 +uniqueswapbias = 0.1 +distanceswapbias = 1.0 +""" + +# ---------------------------------------------------------------------- + +def get_output_dir(working_dir: Path, prefix: str, new_output: bool): + if not new_output and (output_dirs := list(working_dir.glob(prefix + "*"))): + output_dir = max(output_dirs) + else: + output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S')) + output_dir.mkdir() + output_dir.chmod(0o777) + return output_dir + +# ====================================================================== + +try: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("subdir") + parser.add_argument("-n", "--num-runs", type=int, default=16) + parser.add_argument("-s", "--stoptime", default=60*60, type=int, help="termination condition: the maximum number of seconds for the run to continue") + parser.add_argument("-t", "--start-tree", default=None, type=Path) + parser.add_argument("-f", "--source-fasta", default=None, type=Path) + parser.add_argument("--new-output", action="store_true", default=False) + parser.add_argument("--node", dest="slurm_nodelist", default="i22", help="one node to run on, to run on multiple nodes use --exclude-nodes") + parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None) + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s") + exit_code = main(args) or 0 +except Exception as err: + logging.error(f"> {err}\n{traceback.format_exc()}") + exit_code = 1 +exit(exit_code) + +# ====================================================================== diff --git a/proj/weekly-tree/init b/proj/weekly-tree/init index 8939b6e..a8dc0f1 100755 --- 
a/proj/weekly-tree/init +++ b/proj/weekly-tree/init @@ -31,11 +31,11 @@ def init_subtype(subtype: str, previous_dir: Path): subtype_dir = TODAY_DIR.joinpath(subtype) subtype_dir.mkdir(exist_ok=True) if not (previous_newick := subtype_dir.joinpath("previous.newick")).exists(): - previous_raxml_best = previous_dir.joinpath(subtype, "raxml-best.txt") - if not previous_raxml_best.exists(): - print(f">> no previous raxml best run report: {previous_raxml_best}", file=sys.stderr) + previous_tree = previous_dir.joinpath(subtype, "cmaple_tree.txt") + if not previous_tree.exists(): + print(f">> no previous tree run report: {previous_tree}", file=sys.stderr) return - previous_newick_link_target = Path(previous_raxml_best.open().read().strip()) + previous_newick_link_target = Path(previous_tree.open().read().strip()) link = relative_path(previous_newick_link_target, subtype_dir) print(f">>> {previous_newick} -> {link}", file=sys.stderr) previous_newick.symlink_to(link) diff --git a/proj/weekly-tree/init-h3 b/proj/weekly-tree/init-h3 new file mode 100755 index 0000000..36d521e --- /dev/null +++ b/proj/weekly-tree/init-h3 @@ -0,0 +1,93 @@ +#! /usr/bin/env python3 +import sys, os, datetime, subprocess +from pathlib import Path + +WEEKLY_TREE_ROOT_DIR = Path("/syn/eu/ac/results/trees") +TODAY = datetime.date.today().strftime("%Y-%m%d") +TODAY_DIR = WEEKLY_TREE_ROOT_DIR.joinpath(TODAY) +SUBTYPES = ["h3"] + +CMD_DIR = Path(sys.argv[0]).parent +EXPORT_CMD = CMD_DIR.joinpath("export") +UPDATE_TREE_CMD = CMD_DIR.joinpath("update-tree") + +# ====================================================================== + +def main(): + previous_dir = find_previous_dir() + print(f">>> previous dir: {previous_dir}", file=sys.stderr) + init_today_dir(previous_dir) + +# ---------------------------------------------------------------------- + +def init_today_dir(previous_dir: Path): + TODAY_DIR.mkdir(exist_ok=True) + for subtype in SUBTYPES: + init_subtype(subtype, previous_dir=previous_dir) + +# ---------------------------------------------------------------------- + +def init_subtype(subtype: str, previous_dir: Path): + subtype_dir = TODAY_DIR.joinpath(subtype) + subtype_dir.mkdir(exist_ok=True) + if not (previous_newick := subtype_dir.joinpath("previous.newick")).exists(): + previous_raxml_best = previous_dir.joinpath(subtype, "raxml-best.txt") + if not previous_raxml_best.exists(): + print(f">> no previous raxml best run report: {previous_raxml_best}", file=sys.stderr) + return + previous_newick_link_target = Path(previous_raxml_best.open().read().strip()) + link = relative_path(previous_newick_link_target, subtype_dir) + print(f">>> {previous_newick} -> {link}", file=sys.stderr) + previous_newick.symlink_to(link) + + if not (source_fas := subtype_dir.joinpath("source.fas")).exists(): + export_source_fas(subtype=subtype, source_fas=source_fas, previous_dir=previous_dir) + + if not (source_newick := subtype_dir.joinpath("source.newick")).exists(): + update_source_newick(subtype=subtype, source_newick=source_newick, previous_dir=previous_dir) + +# ---------------------------------------------------------------------- + +def update_source_newick(subtype: str, source_newick: Path, previous_dir: Path): + print(f">>> updating {source_newick}", file=sys.stderr) + log = subprocess.check_output([UPDATE_TREE_CMD, subtype], cwd=source_newick.parents[1]).decode("utf-8") + source_newick.with_name("update-tree.log").open("w").write(log) + print(log) + +# ---------------------------------------------------------------------- + +def 
diff --git a/proj/weekly-tree/init-h3 b/proj/weekly-tree/init-h3
new file mode 100755
index 0000000..36d521e
--- /dev/null
+++ b/proj/weekly-tree/init-h3
@@ -0,0 +1,93 @@
#! /usr/bin/env python3
import sys, os, datetime, subprocess
from pathlib import Path

WEEKLY_TREE_ROOT_DIR = Path("/syn/eu/ac/results/trees")
TODAY = datetime.date.today().strftime("%Y-%m%d")
TODAY_DIR = WEEKLY_TREE_ROOT_DIR.joinpath(TODAY)
SUBTYPES = ["h3"]

CMD_DIR = Path(sys.argv[0]).parent
EXPORT_CMD = CMD_DIR.joinpath("export")
UPDATE_TREE_CMD = CMD_DIR.joinpath("update-tree")

# ======================================================================

def main():
    previous_dir = find_previous_dir()
    print(f">>> previous dir: {previous_dir}", file=sys.stderr)
    init_today_dir(previous_dir)

# ----------------------------------------------------------------------

def init_today_dir(previous_dir: Path):
    TODAY_DIR.mkdir(exist_ok=True)
    for subtype in SUBTYPES:
        init_subtype(subtype, previous_dir=previous_dir)

# ----------------------------------------------------------------------

def init_subtype(subtype: str, previous_dir: Path):
    subtype_dir = TODAY_DIR.joinpath(subtype)
    subtype_dir.mkdir(exist_ok=True)
    if not (previous_newick := subtype_dir.joinpath("previous.newick")).exists():
        previous_raxml_best = previous_dir.joinpath(subtype, "raxml-best.txt")
        if not previous_raxml_best.exists():
            print(f">> no previous raxml best run report: {previous_raxml_best}", file=sys.stderr)
            return
        previous_newick_link_target = Path(previous_raxml_best.open().read().strip())
        link = relative_path(previous_newick_link_target, subtype_dir)
        print(f">>> {previous_newick} -> {link}", file=sys.stderr)
        previous_newick.symlink_to(link)

    if not (source_fas := subtype_dir.joinpath("source.fas")).exists():
        export_source_fas(subtype=subtype, source_fas=source_fas, previous_dir=previous_dir)

    if not (source_newick := subtype_dir.joinpath("source.newick")).exists():
        update_source_newick(subtype=subtype, source_newick=source_newick, previous_dir=previous_dir)

# ----------------------------------------------------------------------

def update_source_newick(subtype: str, source_newick: Path, previous_dir: Path):
    print(f">>> updating {source_newick}", file=sys.stderr)
    log = subprocess.check_output([UPDATE_TREE_CMD, subtype], cwd=source_newick.parents[1]).decode("utf-8")
    source_newick.with_name("update-tree.log").open("w").write(log)
    print(log)

# ----------------------------------------------------------------------

def export_source_fas(subtype: str, source_fas: Path, previous_dir: Path):
    print(f">>> exporting {source_fas}", file=sys.stderr)
    subprocess.check_call([EXPORT_CMD, subtype], cwd=source_fas.parents[1])
    subprocess.check_call(["fasta-size", source_fas, previous_dir.joinpath(subtype, source_fas.name)])

# ----------------------------------------------------------------------

def relative_path(target: Path, source_dir: Path):
    # print(target.parts, source_dir.parts)
    for initial_parts in range(min(len(target.parts), len(source_dir.parts))):
        # print(initial_parts, target.parts[initial_parts], source_dir.parts[initial_parts], target.parts[initial_parts] == source_dir.parts[initial_parts])
        if target.parts[initial_parts] != source_dir.parts[initial_parts]:
            break
    # print(initial_parts, target.parts[initial_parts:], source_dir.parts[initial_parts:])
    return Path(*([".."] * (len(source_dir.parts) - initial_parts)), *target.parts[initial_parts:])

# ----------------------------------------------------------------------

def find_previous_dir():
    dirs = list(WEEKLY_TREE_ROOT_DIR.glob("2*"))
    found = None
    use = len(dirs) - 1
    while not found and use >= 0:
        use_date = dirs[use].stem[:9]
        if use_date != TODAY:
            found = dirs[use]
        use -= 1
    if not found:
        dirs_s = "\n  ".join(str(dd) for dd in dirs)
        raise RuntimeError(f"previous dir not found in\n  {dirs_s}")
    return found

# ======================================================================

main()
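`relative_path()` above walks past the shared path prefix, then climbs out of `source_dir` with one `..` per remaining component and descends into `target`; `os.path.relpath` computes the same string and can serve as a cross-check. A hypothetical example (paths follow the layout used in this patch but are invented):

import os
from pathlib import Path

target = Path("/syn/eu/ac/results/trees/2024-0101/h3/cmaple.2024-0101-120000/000.treefile")   # hypothetical
source_dir = Path("/syn/eu/ac/results/trees/2024-0108/h3")                                    # hypothetical
# relative_path(target, source_dir) and os.path.relpath agree here:
print(os.path.relpath(target, source_dir))
# -> ../../2024-0101/h3/cmaple.2024-0101-120000/000.treefile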
diff --git a/proj/weekly-tree/make-cmaple b/proj/weekly-tree/make-cmaple
new file mode 100755
index 0000000..3509efe
--- /dev/null
+++ b/proj/weekly-tree/make-cmaple
@@ -0,0 +1,143 @@
#! /usr/bin/env python3

import sys, os, re, argparse, logging, datetime, concurrent.futures, subprocess, random, traceback
from pathlib import Path

# ======================================================================

sSubtypeNorm = {"BV": "BVIC", "BVIC": "BVIC", "BY": "BYAM", "BYAM": "BYAM", "A(H1N1)": "A(H1N1)", "H1": "A(H1N1)", "1": "A(H1N1)", "A(H3N2)": "A(H3N2)", "H3": "A(H3N2)", "3": "A(H3N2)"}
sSubdirforSubtype = {"BVIC": "bvic", "BYAM": "byam", "A(H1N1)": "h1", "A(H3N2)": "h3"}

sRandomGen = random.SystemRandom()

sMailTo = "weekly-tree@antigenic-cartography.org"

# ----------------------------------------------------------------------

def main(args: argparse.Namespace):
    argv = '"' + '" "'.join(sys.argv) + '"'
    working_dir = Path(args.subdir).resolve()
    source_fas = working_dir.joinpath("source.fas")
    # source_tree = working_dir.joinpath("source.newick")  # no source tree, we make it from scratch each time
    outgroup = source_fas.open().readline().strip()[1:]
    try:
        best_tree_file = cmaple(working_dir=working_dir, source_fas=source_fas, outgroup=outgroup, num_runs=args.num_runs, slurm_nodelist=args.slurm_nodelist, slurm_exclude_nodes=args.slurm_exclude_nodes)
        subprocess.run(["/usr/bin/mail", "-s", f"completed {sys.argv[0]} {working_dir}", sMailTo], input=f"cd {working_dir}\n{argv}\n{best_tree_file}\n", text=True)
        logging.info(f"mail to {sMailTo} about completion sent")
        return 0
    except Exception as err:
        logging.error(f"{err}", exc_info=True)
        subprocess.run(["/usr/bin/mail", "-s", f"FAILED {sys.argv[0]} {working_dir}", sMailTo], input=f"cd {working_dir}\n{argv}\n\n{traceback.format_exc()}\n", text=True)
        logging.info(f"mail to {sMailTo} about FAILURE sent")
        raise

# ----------------------------------------------------------------------

def cmaple(working_dir: Path, source_fas: Path, outgroup: str, slurm_nodelist: str, slurm_exclude_nodes: str, num_runs: int):

    output_dir = get_output_dir(working_dir, "cmaple.")

    cmaple_cmd = [
        "/syn/bin/cmaple",
        "-aln", str(source_fas),
        "-st", "DNA",
        "-m", "GTR",
        "--search", "EXHAUSTIVE",
        "--min-blength", "0.0001",
        "-nt", "1",
        "--overwrite"
    ]

    # ----------------------------------------------------------------------

    def prefix(run_id):
        return f"{output_dir.joinpath(run_id)}"

    def run_cmaple(run_id):
        cmd = (srun_cmd()
               + [f"--job-name=vcm_cmaple {working_dir.name} {run_id}", f"--output={output_dir.joinpath(run_id).with_suffix('.stdout')}", f"--error={output_dir.joinpath(run_id).with_suffix('.stderr')}"]
               + cmaple_cmd
               + ["--prefix", prefix(run_id), "--seed", str(sRandomGen.randint(1, 0xFFFFFFF))])
        with output_dir.joinpath("commands.txt").open("a") as commands_txt:
            print(" ".join(cmd), file=commands_txt)
        # print(cmd)
        subprocess.check_call(cmd)

    def srun_cmd():
        cmd = [
            "srun",
            "--cpus-per-task=2",
            "--ntasks=1",
            "--threads=1",
            f"--nodelist={slurm_nodelist}",
        ]
        if slurm_exclude_nodes:
            cmd.append(f"--exclude={slurm_exclude_nodes}")
        return cmd

    # ----------------------------------------------------------------------

    def submit():
        failures = 0
        logging.info(f"submitting {num_runs} jobs")
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
            future_to_runid = {executor.submit(run_cmaple, run_id): run_id for run_id in (f"{ri:03d}" for ri in range(num_runs))}
            for future in concurrent.futures.as_completed(future_to_runid):
                try:
                    future.result()
                except Exception as err:
                    logging.error(f"cmaple {working_dir} {future_to_runid[future]} FAILED: {err}")
                    failures += 1
                else:
                    logging.info(f"cmaple run {working_dir} {future_to_runid[future]} completed")
        if failures:
            logging.error(f"cmaple {working_dir} FAILED: {failures} runs failed")
        else:
            logging.info("cmaple completed")

    # ----------------------------------------------------------------------

    def results_available():
        return len(list(output_dir.glob("*000.treefile"))) > 0

    tree_file = output_dir.joinpath("000.treefile")

    if not results_available():
        start = datetime.datetime.now()
        submit()
        logging.info(f"cmaple elapsed: {datetime.datetime.now() - start}")

    output_dir.parent.joinpath("cmaple_tree.txt").open("w").write(str(tree_file))

    return tree_file

# ----------------------------------------------------------------------

def get_output_dir(working_dir, prefix):
    output_dirs = list(working_dir.glob(prefix + "*"))
    if output_dirs:
        output_dir = max(output_dirs)
    else:
        output_dir = working_dir.joinpath(prefix + datetime.datetime.now().strftime('%Y-%m%d-%H%M%S'))
        output_dir.mkdir()
        output_dir.chmod(0o777)
    return output_dir

# ======================================================================

try:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("subdir")
    # parser.add_argument("outgroup")
    parser.add_argument("-n", "--num-runs", dest="num_runs", type=int, default=1)
    parser.add_argument("--node", dest="slurm_nodelist", default="i22")
    parser.add_argument("--exclude-nodes", dest="slurm_exclude_nodes", default=None)
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(asctime)s: %(message)s")
    exit_code = main(args) or 0
except Exception as err:
    logging.error(f"> {err}\n{traceback.format_exc()}")
    exit_code = 1
exit(exit_code)

# ======================================================================
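`submit()` above fans the CMAPLE runs out over Slurm with a thread pool, one blocking `srun` per run, and counts failures rather than aborting on the first one. A stripped-down, self-contained sketch of that pattern, with toy commands rather than the script's own `srun` lines:

import concurrent.futures, subprocess

def fan_out(commands: list[list[str]]) -> int:
    # run all commands concurrently; return how many exited non-zero
    failures = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(commands)) as executor:
        future_to_cmd = {executor.submit(subprocess.check_call, cmd): cmd for cmd in commands}
        for future in concurrent.futures.as_completed(future_to_cmd):
            try:
                future.result()
            except Exception:
                failures += 1
    return failures

# fan_out([["true"], ["false"]]) == 1 on any POSIX system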
diff --git a/proj/weekly-tree/pdf b/proj/weekly-tree/pdf
index eacaa2f..f8d1fa5 100755
--- a/proj/weekly-tree/pdf
+++ b/proj/weekly-tree/pdf
@@ -18,7 +18,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "b" ]]; then HIDB_SUBTYPE=b; fi
 # ----------------------------------------------------------------------------------------------------
 if [[ ! -f ${SUBTYPE}/${SUBTYPE}.tjz ]]; then
-    ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/garli-best.txt) ${SUBTYPE}/${SUBTYPE}.tjz
+    ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-trim-col.txt) ${SUBTYPE}/${SUBTYPE}.tjz
 fi
 if [[ ! -f ${SUBTYPE}/${SUBTYPE}.pdf ]]; then
     tal -s ${TAL} ${SUBTYPE}/${SUBTYPE}.tjz ${SUBTYPE}/${SUBTYPE}.txt ${SUBTYPE}/${SUBTYPE}.pdf
@@ -30,7 +30,7 @@ if [[ ${HIDB_SUBTYPE:0:1} == "h" ]]; then
     ASR_INFIX=".asr"
     if [[ ! -f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ]]; then
         ASR_STATES=$(cat ${SUBTYPE}/raxml-asr-best.txt | sed 's/ancestralTree/ancestralStates/')
-        ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/garli-best.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz
+        ${AE_ROOT}/bin/tree-to-json --populate ${SUBTYPE} --asr-tree $(cat ${SUBTYPE}/raxml-asr-best.txt) --asr-states ${ASR_STATES} --ladderize --print-cumulative 20 --remove-if-cumulative-more-than ${CUMUL[${SUBTYPE}]} $(cat ${SUBTYPE}/cmaple_tree-rot-trim-col.txt) ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz
     fi
     if [[ ! -f ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf ]]; then
         tal -s ${TAL} -D aa-transitions-method=imported ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.tjz ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.txt ${SUBTYPE}/${SUBTYPE}${ASR_INFIX}.pdf
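The pointer file consumed above comes from a chain of post-processing steps, each appending a suffix before the extension: `cmaple_tree.txt` -> `cmaple_tree-rot.txt` (rotate) -> `cmaple_tree-rot-trim.txt` (trim) -> `cmaple_tree-rot-trim-col.txt` (collapse), matching the `submit` script below. A sketch of the naming rule, assuming the suffixing is the entire convention:

from pathlib import Path

def chained(pointer: Path, step: str) -> Path:
    # insert "-<step>" between the stem and the extension
    return pointer.with_name(pointer.stem + "-" + step + pointer.suffix)

# chained(Path("h3/cmaple_tree-rot-trim.txt"), "col") -> Path("h3/cmaple_tree-rot-trim-col.txt")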
diff --git a/proj/weekly-tree/raxml-asr b/proj/weekly-tree/raxml-asr
index 8baa059..6dea16c 100755
--- a/proj/weekly-tree/raxml-asr
+++ b/proj/weekly-tree/raxml-asr
@@ -10,8 +10,9 @@ sRandomGen = random.SystemRandom()
 def main(args: argparse.Namespace):
     argv = '"' + '" "'.join(sys.argv) + '"'
     working_dir = Path(args.subdir).resolve()
-    source_fas = working_dir.joinpath("source.fas")
-    source_tree = Path(working_dir.joinpath("raxml-best.txt").open().read().strip()).resolve()
+    source_fas = working_dir.joinpath("source-trim.fas")
+    source_tree = Path(working_dir.joinpath("cmaple_tree-rot-trim.txt").open().read().strip()).resolve()
+
     if os.fork():
         # parent
         exit(0)
@@ -30,13 +31,25 @@ def raxml_ng_asr(working_dir: Path, source_fas: Path, source_tree: Path, slurm_n
 
     output_dir = get_output_dir(working_dir, "raxml-ng-asr.")
 
+    source_tree_with_min_branch_lengths = source_tree.parent.joinpath("000_minbl.treefile")
+    bl_script_path = os.environ["AE_ROOT"] + "/R/set_min_branch_lengths.R"
+
+    set_min_bl_cmd = [
+        bl_script_path,
+        str(source_tree.resolve()),
+        str(source_tree_with_min_branch_lengths.resolve()),
+        "0.0001"
+    ]
+
+    # ----------------------------------------------------------------------
+
     raxml_cmd = [
         "/syn/bin/raxml-ng",
         "--ancestral",
         "--model", "GTR+G+I",  # raxml: -m GTRGAMMAI -c 4
         "--msa", str(source_fas),
         "--msa-format", "FASTA",
-        "--tree", str(source_tree),
+        "--tree", str(source_tree_with_min_branch_lengths),
         "--log", "PROGRESS",  # VERBOSE, DEBUG
         "--threads", "1",
     ]
@@ -53,6 +66,7 @@ def raxml_ng_asr(working_dir: Path, source_fas: Path, source_tree: Path, slurm_n
         with output_dir.joinpath("commands.txt").open("a") as commands_txt:
             print(" ".join(cmd), file=commands_txt)
             # print(cmd)
+        subprocess.call(set_min_bl_cmd)
         subprocess.check_call(cmd)
 
     def srun_cmd():
diff --git a/proj/weekly-tree/submit b/proj/weekly-tree/submit
index 4cb9be3..4c5fd51 100755
--- a/proj/weekly-tree/submit
+++ b/proj/weekly-tree/submit
@@ -1,3 +1,5 @@
+
+
 #! /usr/bin/env bash
 set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace
 
@@ -6,13 +8,15 @@ submit_tree()
     NODE="$1"
     SUBTYPE="$2"
     (
-        ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} &&
+        ${AE_ROOT}/proj/weekly-tree/make-cmaple --node ${NODE} ${SUBTYPE} &&
+        ${AE_ROOT}/R/rotate.R ${SUBTYPE}/cmaple_tree.txt ${SUBTYPE}/source.fas &&
+        ${AE_ROOT}/R/tree_trim.R ${SUBTYPE} ${SUBTYPE}/cmaple_tree-rot.txt ${SUBTYPE}/source.fas &&
         ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} &&
-        ${AE_ROOT}/proj/weekly-tree/garli --node ${NODE} ${SUBTYPE} &&
+        ${AE_ROOT}/R/collapse_short_branches.R ${SUBTYPE}/cmaple_tree-rot-trim.txt 0.00005 &&
         ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE}
     ) >${SUBTYPE}/submit.log 2>&1 &
 }
 
 submit_tree i20 h1
 submit_tree i22 h3
-submit_tree i19 bvic
+submit_tree i18 bvic
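One detail worth noting in the raxml-asr hunk above: the branch-length fix-up is launched with `subprocess.call`, which ignores a non-zero exit status, so a failure of `set_min_branch_lengths.R` would only surface later when raxml-ng cannot read the tree. A stricter variant would raise immediately; a sketch with hypothetical paths, not the patch's own code:

import subprocess

set_min_bl_cmd = [
    "R/set_min_branch_lengths.R",               # hypothetical relative path for illustration
    "cmaple.2024-0101-120000/000.treefile",     # hypothetical input tree
    "cmaple.2024-0101-120000/000_minbl.treefile",
    "0.0001",
]
subprocess.check_call(set_min_bl_cmd)  # unlike subprocess.call(), raises CalledProcessError on failure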
diff --git a/proj/weekly-tree/submit-memvar b/proj/weekly-tree/submit-memvar
new file mode 100755
index 0000000..1a2c593
--- /dev/null
+++ b/proj/weekly-tree/submit-memvar
@@ -0,0 +1,18 @@
#! /usr/bin/env bash
set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace

submit_tree()
{
    NODE="$1"
    SUBTYPE="$2"
    (
        ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/garli-memvar --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE}
    ) >${SUBTYPE}/submit.log 2>&1 &
}

# submit_tree i21 h1
# submit_tree i22 h3
submit_tree i22 bvic
diff --git a/proj/weekly-tree/submit-new b/proj/weekly-tree/submit-new
new file mode 100755
index 0000000..69dd730
--- /dev/null
+++ b/proj/weekly-tree/submit-new
@@ -0,0 +1,18 @@
#! /usr/bin/env bash
set -o errexit -o errtrace -o pipefail -o nounset # -o xtrace

submit_tree()
{
    NODE="$1"
    SUBTYPE="$2"
    (
        ${AE_ROOT}/proj/weekly-tree/raxml --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/raxml-asr --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/garli-h3 --node ${NODE} ${SUBTYPE} &&
        ${AE_ROOT}/proj/weekly-tree/pdf ${SUBTYPE}
    ) >${SUBTYPE}/submit.log 2>&1 &
}

submit_tree i20 h1 #i21
submit_tree i22 h3
submit_tree i19 bvic