From 85704899dd1b2625a96e388d9089b28db9fa09e8 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:57:02 +0000 Subject: [PATCH 01/31] Update Steell2025 reference --- R/Consistency.R | 10 +++++----- inst/REFERENCES.bib | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/Consistency.R b/R/Consistency.R index bd695f758..2aeb4adbc 100644 --- a/R/Consistency.R +++ b/R/Consistency.R @@ -1,4 +1,4 @@ -#' Consistency / retention "indices" +#' Consistency and retention "indices" #' #' `Consistency()` calculates the consistency "index" and retention index #' \insertCite{Farris1989}{TreeSearch} @@ -19,7 +19,7 @@ #' Note that as the possible values of the consistency index do not range from #' zero to one, it is not an index in the mathematical sense of the term. #' Shortcomings of this measure are widely documented -#' \insertCite{Archie1989,Brooks1986,Steell2023}{TreeSearch}. +#' \insertCite{Archie1989,Brooks1986,Steell2025}{TreeSearch}. #' #' The maximum length of a character (see [`MaximumLength()`]) is the #' number of steps in a parsimonious reconstruction on the longest possible tree @@ -35,7 +35,7 @@ #' possible values runs from zero (least consistent) to one #' (perfectly consistent). #' -#' The **relative homoplasy index** \insertCite{Steell2023}{TreeSearch} is +#' The **relative homoplasy index** \insertCite{Steell2025}{TreeSearch} is #' the ratio of the observed excess tree length to the excess tree length #' due to chance, taken as the median score of a character when the leaves #' of the given tree are randomly shuffled. @@ -45,8 +45,8 @@ #' default treatment in [`TreeLength()`]. #' #' @param nRelabel Integer specifying how many times to relabel leaves when -#' estimating null tree length for \acronym{RHI} calculation. -#' \insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +#' computing MCMC estimate of null tree length for \acronym{RHI} calculation. +#' \insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that #' 100 may suffice. #' If zero (the default), the \acronym{RHI} is not calculated. #' @inheritParams CharacterLength diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index d82e34841..52b216f40 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -576,7 +576,7 @@ @article{Steel1996 number = {4} } -@article{Steell2023, +@article{Steell2025, title = {Revealing Patterns of Homoplasy in Discrete Phylogenetic Datasets with a Cross-Comparable Index}, author = {Steell, Elizabeth M. and Hsiang, Allison Y. and Field, Daniel J.}, year = {2025}, From 377c0a2245823d066c487b5df6af5690ec3229a3 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:01:09 +0000 Subject: [PATCH 02/31] Quartet concordance implementation --- NEWS.md | 3 + R/Concordance.R | 120 +++++++++++++++++++++--------------- R/RcppExports.R | 4 ++ src/RcppExports.cpp | 12 ++++ src/TreeSearch-init.c | 6 ++ src/quartet_concordance.cpp | 103 +++++++++++++++++++++++++++++++ 6 files changed, 199 insertions(+), 49 deletions(-) create mode 100644 src/quartet_concordance.cpp diff --git a/NEWS.md b/NEWS.md index a08a957e6..85fbbf65c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +- `QuartetConcordance()` gains `return` parameter and fast C++ implementation. + + # TreeSearch 1.7.0.9000 (development) - Fix regression in `MaximumLength()`. 
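The relative homoplasy index documented in the first patch above compares observed tree length against a null length obtained by randomly relabelling leaves. A minimal sketch of that null estimate, using the exported TreeLength() and example data shipped with TreeSearch (the patched Consistency() performs the per-character version internally when nRelabel > 0; this whole-matrix version is only illustrative):

library("TreeTools", quietly = TRUE)  # NJTree()
library("TreeSearch")                 # TreeLength(), congreveLamsdellMatrices
data("congreveLamsdellMatrices", package = "TreeSearch")
dataset <- congreveLamsdellMatrices[[1]]
tree <- NJTree(dataset)
# Null expectation: median length when leaf labels are shuffled on the
# fixed topology (shown at whole-matrix level; the RHI works per character)
nullLength <- median(replicate(100, {
  shuffled <- tree
  shuffled[["tip.label"]] <- sample(shuffled[["tip.label"]])
  TreeLength(shuffled, dataset)
}))
c(observed = TreeLength(tree, dataset), null = nullLength)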
diff --git a/R/Concordance.R b/R/Concordance.R index c5c76641b..260a14b56 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -27,6 +27,9 @@ #' other implementations (e.g. IQ-TREE) follow #' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample #' of quartets for a faster, if potentially less accurate, computation. +#TODO +#' `QuartetConcordance()` in principle supports ambiguous character states, +#' but this has not yet been tested. #' # `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report # the proportion of clustering information and phylogenetic information @@ -56,7 +59,7 @@ #' @examples #' data("congreveLamsdellMatrices", package = "TreeSearch") #' dataset <- congreveLamsdellMatrices[[1]][, 1:20] -#' tree <- referenceTree +#' tree <- TreeSearch::referenceTree #' qc <- QuartetConcordance(tree, dataset) #' cc <- ClusteringConcordance(tree, dataset) #' pc <- PhylogeneticConcordance(tree, dataset) @@ -90,7 +93,8 @@ #' @name SiteConcordance #' @family split support functions #' @export -QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { +QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, + return = "mean") { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -109,57 +113,75 @@ QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { logical(NTip(dataset))) characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) + charLevels <- attr(dataset, "allLevels") + charLevels[rowSums(attr(dataset, "contrast")) > 1] <- NA + charLevels <- setdiff(charLevels, "-") - cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) - setNames(apply(logiSplits, 2, function (split) { - cli_progress_update(1, .envir = parent.frame(2)) - quarts <- apply(characters, 2, function (char) { - tab <- table(split, char) - nCol <- dim(tab)[[2]] - if (nCol > 1L) { - # Consider the case - # split 0 1 2 - # FALSE 2 3 0 - # TRUE 0 4 2 - # - # Concordant quartets with bin i = 1 (character 0) and bin j = 2 - # (character 1) include any combination of the two taxa from - # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], - # i.e. choose(2, 2) * choose(4, 2) - concordant <- sum(vapply(seq_len(nCol), function (i) { - inBinI <- tab[1, i] - iChoices <- choose(inBinI, 2) - sum(vapply(seq_len(nCol)[-i], function (j) { - inBinJ <- tab[2, j] - iChoices * choose(inBinJ, 2) - }, 1)) - }, 1)) - - # To be discordant, we must select a pair of taxa from TT and from FF; - # and the character states must group each T with an F - # e.g. T0 T1 F0 F1 - # T0 T1 F0 F2 would not be discordant - just uninformative - discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) - - # Only quartets that include two T and two F can be decisive - # Quartets must also include two pairs of characters - decisive <- concordant + discordant - - # Return the numerator and denominatory of equation 2 in - # Minh et al. 
2020 - c(concordant, decisive) - } else { - c(0L, 0L) - } - }) + charInt <- matrix(match(characters, charLevels), + nrow = nrow(characters), + dimnames = dimnames(characters)) + raw_counts <- quartet_concordance(logiSplits, charInt) + + num <- raw_counts$concordant + den <- raw_counts$decisive + + options <- c("char", "default") + return <- options[[pmatch(tolower(trimws(return)), options, + nomatch = length(options))]] + + if (return == "default") { + if (isTRUE(weight)) { + # Sum numerator and denominator across sites (columns), then divide + # This matches weighted.mean(num/den, den) == sum(num) / sum(den) + split_sums_num <- rowSums(num) + split_sums_den <- rowSums(den) + ret <- ifelse(split_sums_den == 0, NA_real_, split_sums_num / split_sums_den) + } else { + # Mean of ratios per site + # Avoid division by zero (0/0 -> NaN -> NA handled by na.rm) + ratios <- num / den + # Replace NaN/Inf with NA for rowMeans calculation + ratios[!is.finite(ratios)] <- NA + ret <- rowMeans(ratios, na.rm = TRUE) + } + + setNames(ret, names(splits)) + } else { + p <- num / den if (isTRUE(weight)) { - quartSums <- rowSums(quarts) - ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) + vapply(seq_len(dim(num)[[2]]), function(i) { + weighted.mean(num[, i] / den[, i], den[, i]) + }, double(1)) } else { - mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), - na.rm = TRUE) + vapply(seq_len(dim(num)[[2]]), function(i) { + mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) + }, double(1)) } - }), names(splits)) + } +} + +#' @importFrom fastmap fastmap +.ExpectedMICache <- fastmap() + +# @param a must be a vector of length <= 2 +# @param b may be longer +#' @importFrom base64enc base64encode +.ExpectedMI <- function(a, b) { + if (length(a) < 2 || length(b) < 2) { + 0 + } else { + key <- base64enc::base64encode(mi_key(a, b)) + if (.ExpectedMICache$has(key)) { + .ExpectedMICache$get(key) + } else { + ret <- expected_mi(a, b) + + # Cache: + .ExpectedMICache$set(key, ret) + # Return: + ret + } + } } #' @importFrom TreeDist Entropy diff --git a/R/RcppExports.R b/R/RcppExports.R index 33cb4a90f..ace4dfa10 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -17,6 +17,10 @@ morphy_profile <- function(edge, MorphyHandls, weight, sequence, profiles, targe .Call(`_TreeSearch_morphy_profile`, edge, MorphyHandls, weight, sequence, profiles, target) } +quartet_concordance <- function(splits, characters) { + .Call(`_TreeSearch_quartet_concordance`, splits, characters) +} + nni <- function(edge, randomEdge, whichSwitch) { .Call(`_TreeSearch_nni`, edge, randomEdge, whichSwitch) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index bbb5c9868..120c10049 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -67,6 +67,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// quartet_concordance +List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix characters); +RcppExport SEXP _TreeSearch_quartet_concordance(SEXP splitsSEXP, SEXP charactersSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const LogicalMatrix >::type splits(splitsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix >::type characters(charactersSEXP); + rcpp_result_gen = Rcpp::wrap(quartet_concordance(splits, characters)); + return rcpp_result_gen; +END_RCPP +} // nni IntegerMatrix nni(const IntegerMatrix edge, const IntegerVector randomEdge, const IntegerVector whichSwitch); RcppExport SEXP _TreeSearch_nni(SEXP 
edgeSEXP, SEXP randomEdgeSEXP, SEXP whichSwitchSEXP) { diff --git a/src/TreeSearch-init.c b/src/TreeSearch-init.c index 729dc2a76..014e25d71 100644 --- a/src/TreeSearch-init.c +++ b/src/TreeSearch-init.c @@ -23,6 +23,8 @@ extern SEXP _TreeSearch_preorder_morphy_by_char(SEXP, SEXP); extern SEXP _TreeSearch_morphy_iw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP _TreeSearch_morphy_profile(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _TreeSearch_quartet_concordance(SEXP, SEXP); + static const R_CallMethodDef callMethods[] = { {"_R_wrap_mpl_new_Morphy", (DL_FUNC) &_R_wrap_mpl_new_Morphy, 0}, {"_R_wrap_mpl_delete_Morphy", (DL_FUNC) &_R_wrap_mpl_delete_Morphy, 1}, @@ -55,8 +57,12 @@ static const R_CallMethodDef callMethods[] = { // {"_TreeSearch_tbr_moves", (DL_FUNC) &_TreeSearch_tbr_moves, 1}, {"_TreeSearch_preorder_morphy", (DL_FUNC) &_TreeSearch_preorder_morphy, 2}, {"_TreeSearch_preorder_morphy_by_char", (DL_FUNC) &_TreeSearch_preorder_morphy_by_char, 2}, + {"_TreeSearch_morphy_iw", (DL_FUNC) &_TreeSearch_morphy_iw, 7}, {"_TreeSearch_morphy_profile", (DL_FUNC) &_TreeSearch_morphy_profile, 6}, + + {"_TreeSearch_quartet_concordance",(DL_FUNC) &_TreeSearch_quartet_concordance, 2}, + {"MORPHYLENGTH", (DL_FUNC) &MORPHYLENGTH, 4}, {"RANDOM_TREE", (DL_FUNC) &RANDOM_TREE, 1}, {"RANDOM_TREE_SCORE", (DL_FUNC) &RANDOM_TREE_SCORE, 2}, diff --git a/src/quartet_concordance.cpp b/src/quartet_concordance.cpp new file mode 100644 index 000000000..32be65b79 --- /dev/null +++ b/src/quartet_concordance.cpp @@ -0,0 +1,103 @@ +#include +#include +#include + +using namespace Rcpp; + +// [[Rcpp::export]] +List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix characters) { + int n_splits = splits.ncol(); + int n_chars = characters.ncol(); + int n_taxa = splits.nrow(); + + // Storage for results + // Using NumericMatrix to safely hold potentially large counts (double precision) + NumericMatrix concordant(n_splits, n_chars); + NumericMatrix decisive(n_splits, n_chars); + + // Loop over every Split (s) and every Character (c) + for (int s = 0; s < n_splits; ++s) { + for (int c = 0; c < n_chars; ++c) { + + // 1. Build Contingency Table manually + // Map: Character State (int) -> pair + // We use a map to handle arbitrary state integers (0, 1, 2, or 10, 20...). + std::map> counts; + + for (int t = 0; t < n_taxa; ++t) { + int char_state = characters(t, c); + + // Skip NAs (ambiguous characters) + if (IntegerVector::is_na(char_state)) continue; + + // Check split membership (FALSE=0, TRUE=1) + if (splits(t, s)) { + counts[char_state].second++; + } else { + counts[char_state].first++; + } + } + + // If fewer than 2 states are present, no quartets can be decisive + if (counts.size() < 2) { + concordant(s, c) = 0; + decisive(s, c) = 0; + continue; + } + + // 2. Calculate Combinatorics + // We need to sum over combinations without O(N^2) loops. 
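      // Worked example, using the contingency table from the comments in
      // QuartetConcordance() above (states 0/1/2; FALSE row {2, 3, 0},
      // TRUE row {0, 4, 2}):
      //   choose(n0, 2) = {1, 3, 0}; choose(n1, 2) = {0, 6, 1}; total = 7
      //   concordant = 1*(7 - 0) + 3*(7 - 6) + 0*(7 - 1) = 10
      //   P = n0 * n1 = {0, 12, 0}; discordant = 0.5 * (12*12 - 144) = 0
      // matching the direct sums over state pairs in the R reference code.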
+ + // Variables for Concordance: sum( choose(n0i, 2) * sum(choose(n1j, 2)) ) where j != i + // This is equivalent to: sum( choose(n0i, 2) * (Total_Choose_N1 - choose(n1i, 2)) ) + double total_choose_n1 = 0; + std::vector choose_n0_vec; + std::vector choose_n1_vec; + choose_n0_vec.reserve(counts.size()); + choose_n1_vec.reserve(counts.size()); + + // Variables for Discordance: sum( n0i*n1i * n0j*n1j ) for i < j + // This is equivalent to: 0.5 * ( (sum P)^2 - sum(P^2) ) where P_k = n0k * n1k + double sum_prod = 0; + double sum_sq_prod = 0; + + for (auto const& [state, cnt] : counts) { + double n0 = cnt.first; + double n1 = cnt.second; + + // Concordance Pre-calcs + double cn0 = n0 * (n0 - 1) / 2.0; + double cn1 = n1 * (n1 - 1) / 2.0; + + choose_n0_vec.push_back(cn0); + choose_n1_vec.push_back(cn1); + total_choose_n1 += cn1; + + // Discordance Pre-calcs + double p = n0 * n1; + sum_prod += p; + sum_sq_prod += p * p; + } + + // Finalize Concordance + double conc_val = 0; + for (size_t k = 0; k < choose_n0_vec.size(); ++k) { + // Pairs of (0,0) from state K and (1,1) from any OTHER state + conc_val += choose_n0_vec[k] * (total_choose_n1 - choose_n1_vec[k]); + } + + // Finalize Discordance + double disc_val = 0.5 * (sum_prod * sum_prod - sum_sq_prod); + + // Store results + concordant(s, c) = conc_val; + // Decisive = Concordant + Discordant + decisive(s, c) = conc_val + disc_val; + } + } + + return List::create( + _["concordant"] = concordant, + _["decisive"] = decisive + ); +} From 7a12f60958f0fe0f921aac22c646e48f06d964fc Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:04:10 +0000 Subject: [PATCH 03/31] MatchStrings in PhyloConc --- NAMESPACE | 1 + R/Concordance.R | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 79e40d690..416d3edae 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -159,6 +159,7 @@ importFrom(TreeTools,Log2Unrooted) importFrom(TreeTools,Log2UnrootedMult) importFrom(TreeTools,LogDoubleFactorial) importFrom(TreeTools,MakeTreeBinary) +importFrom(TreeTools,MatchStrings) importFrom(TreeTools,MatrixToPhyDat) importFrom(TreeTools,NRooted) importFrom(TreeTools,NSplits) diff --git a/R/Concordance.R b/R/Concordance.R index 260a14b56..1ee2e8582 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -237,14 +237,15 @@ ClusteringConcordance <- function (tree, dataset) { } #' @rdname SiteConcordance -#' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits +#' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits +#' MatchStrings #' @export PhylogeneticConcordance <- function (tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.Splits(tree) if (is.null(names(splits))) { names(splits) <- paste0("sp", seq_along(splits)) From 9a771703900b3a0acb4665673521466721313c30 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:20:03 +0000 Subject: [PATCH 04/31] SharedPhylogeneticConcordance --- man/SiteConcordance.Rd | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 9a27ac8ad..72fb08f13 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -9,7 +9,7 @@ \alias{SharedPhylogeneticConcordance} \title{Calculate site concordance factor} 
\usage{ -QuartetConcordance(tree, dataset = NULL, weight = TRUE) +QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") ClusteringConcordance(tree, dataset) @@ -60,10 +60,18 @@ If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample of quartets for a faster, if potentially less accurate, computation. +\code{QuartetConcordance()} in principle supports ambiguous character states, +but this has not yet been tested. \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. Complete documentation and discussion will follow in due course. +\code{SharedPhylogeneticConcordance()} treats each character as a simple tree. +Each token in the character corresponds to a node whose pendant edges are the +taxa with that token. +The Shared Phylogenetic Concordance for each character in \code{dataset} is then +the Shared Phylogenetic Information \insertCite{Smith2020}{TreeSearch} of +this tree and \code{tree}. } \examples{ data("congreveLamsdellMatrices", package = "TreeSearch") From fdad67ac845db4a7b59deaabab28a2ad846f15a3 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:22:24 +0000 Subject: [PATCH 05/31] expected_mi --- R/RcppExports.R | 8 +++ src/RcppExports.cpp | 24 ++++++++ src/TreeSearch-init.c | 5 ++ src/expected_mi.cpp | 128 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 165 insertions(+) create mode 100644 src/expected_mi.cpp diff --git a/R/RcppExports.R b/R/RcppExports.R index ace4dfa10..a2bc7abe5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,14 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +expected_mi <- function(ni, nj) { + .Call(`_TreeSearch_expected_mi`, ni, nj) +} + +mi_key <- function(ni, nj) { + .Call(`_TreeSearch_mi_key`, ni, nj) +} + preorder_morphy <- function(edge, MorphyHandl) { .Call(`_TreeSearch_preorder_morphy`, edge, MorphyHandl) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 120c10049..d5708bd8c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,30 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// expected_mi +double expected_mi(const IntegerVector& ni, const IntegerVector& nj); +RcppExport SEXP _TreeSearch_expected_mi(SEXP niSEXP, SEXP njSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const IntegerVector& >::type ni(niSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nj(njSEXP); + rcpp_result_gen = Rcpp::wrap(expected_mi(ni, nj)); + return rcpp_result_gen; +END_RCPP +} +// mi_key +RawVector mi_key(IntegerVector ni, IntegerVector nj); +RcppExport SEXP _TreeSearch_mi_key(SEXP niSEXP, SEXP njSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type ni(niSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type nj(njSEXP); + rcpp_result_gen = Rcpp::wrap(mi_key(ni, nj)); + return rcpp_result_gen; +END_RCPP +} // preorder_morphy int preorder_morphy(IntegerMatrix edge, SEXP MorphyHandl); RcppExport SEXP _TreeSearch_preorder_morphy(SEXP edgeSEXP, SEXP MorphyHandlSEXP) { diff --git a/src/TreeSearch-init.c 
b/src/TreeSearch-init.c index 014e25d71..f054c9aa6 100644 --- a/src/TreeSearch-init.c +++ b/src/TreeSearch-init.c @@ -23,6 +23,9 @@ extern SEXP _TreeSearch_preorder_morphy_by_char(SEXP, SEXP); extern SEXP _TreeSearch_morphy_iw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP _TreeSearch_morphy_profile(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _TreeSearch_expected_mi(SEXP, SEXP); +extern SEXP _TreeSearch_mi_key(SEXP, SEXP); + extern SEXP _TreeSearch_quartet_concordance(SEXP, SEXP); static const R_CallMethodDef callMethods[] = { @@ -60,6 +63,8 @@ static const R_CallMethodDef callMethods[] = { {"_TreeSearch_morphy_iw", (DL_FUNC) &_TreeSearch_morphy_iw, 7}, {"_TreeSearch_morphy_profile", (DL_FUNC) &_TreeSearch_morphy_profile, 6}, + {"_TreeSearch_expected_mi", (DL_FUNC) &_TreeSearch_expected_mi, 2}, + {"_TreeSearch_mi_key", (DL_FUNC) &_TreeSearch_mi_key, 2}, {"_TreeSearch_quartet_concordance",(DL_FUNC) &_TreeSearch_quartet_concordance, 2}, diff --git a/src/expected_mi.cpp b/src/expected_mi.cpp new file mode 100644 index 000000000..867b97b23 --- /dev/null +++ b/src/expected_mi.cpp @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include +using namespace Rcpp; + +#define MAX_FACTORIAL_LOOKUP 8192 +static double log2_factorial_table[MAX_FACTORIAL_LOOKUP + 1]; +static const double LOG2_E = 1.4426950408889634; + +__attribute__((constructor)) +void initialize_factorial_cache() { + log2_factorial_table[0] = 0.0; + for (int i = 1; i <= MAX_FACTORIAL_LOOKUP; i++) { + log2_factorial_table[i] = log2_factorial_table[i - 1] + std::log2(i); + } +} + +// Fast lookup with bounds checking +inline double l2factorial(int n) { + if (n <= MAX_FACTORIAL_LOOKUP) { + return log2_factorial_table[n]; + } else { + return lgamma(n + 1) * LOG2_E; + } +} + +// ni and nj are vectors listing the number of entitites in each cluster +// [[Rcpp::export]] +double expected_mi(const IntegerVector &ni, const IntegerVector &nj) { + // ni = {a, N-a}; nj = counts of character states + const int a = ni[0]; + const int N = ni[0] + ni[1]; + if (a <= 0 || a >= N) return 0.0; // trivial split + + const double invN = 1.0 / static_cast(N); + const double log2N = std::log2(static_cast(N)); + const double log2a = std::log2(static_cast(a)); + const double log2Na = std::log2(static_cast(N - a)); + const double log2_denom = l2factorial(N) - l2factorial(a) - l2factorial(N - a); + + double emi = 0.0; + + for (int j = 0; j < nj.size(); ++j) { + int mj = nj[j]; + if (mj <= 0) continue; + + int kmin = std::max(0, a + mj - N); + int kmax = std::min(a, mj); + if (kmin > kmax) continue; + + const double log2mj = std::log2(static_cast(mj)); + + // compute P(K=kmin) + double log2P = (l2factorial(mj) - l2factorial(kmin) - l2factorial(mj - kmin)) + + (l2factorial(N - mj) - l2factorial(a - kmin) - l2factorial(N - mj - (a - kmin))) + - log2_denom; + double Pk = std::pow(2.0, log2P); + + for (int k = kmin; k <= kmax; ++k) { + if (Pk > 0.0) { + // contribution from inside the split + if (k > 0) { + double mi_in = std::log2(static_cast(k)) + log2N - (log2a + log2mj); + emi += (static_cast(k) * invN) * mi_in * Pk; + } + // contribution from outside the split + int kout = mj - k; + if (kout > 0) { + double mi_out = std::log2(static_cast(kout)) + log2N - (log2Na + log2mj); + emi += (static_cast(kout) * invN) * mi_out * Pk; + } + } + // Update P(k) → P(k+1) + if (k < kmax) { + double numer = static_cast((mj - k) * (a - k)); + double denom = static_cast((k + 1) * (N - mj - a + k + 1)); + Pk *= numer / denom; + } + } + } + + return 
emi; +} + +// [[Rcpp::export]] +RawVector mi_key(IntegerVector ni, IntegerVector nj) { + if (ni.size() != 2) { + Rcpp::stop("ni must be a vector of length 2."); + } + + std::vector ni_vals = {static_cast(ni[0]), + static_cast(ni[1])}; + if (ni_vals[0] > 65535 || ni_vals[1] > 65535) { + Rcpp::stop("ni values must be ≤ 65535."); + } + std::sort(ni_vals.begin(), ni_vals.end()); + + std::vector nj_vals; + for (int val : nj) { + if (val > 65535) { + Rcpp::stop("nj values must be ≤ 65535."); + } + nj_vals.push_back(static_cast(val)); + } + std::sort(nj_vals.begin(), nj_vals.end()); + + + // Total number of 16-bit ints + size_t n = 2 + nj_vals.size(); + RawVector key_raw(n * 2); + + // Write ni + key_raw[0] = ni_vals[0] >> 8; + key_raw[1] = ni_vals[0] & 0xFF; + key_raw[2] = ni_vals[1] >> 8; + key_raw[3] = ni_vals[1] & 0xFF; + + // Write nj + for (size_t i = 0; i < nj_vals.size(); ++i) { + key_raw[4 + i + i] = nj_vals[i] >> 8; + key_raw[4 + i + i + 1] = nj_vals[i] & 0xFF; + } + + return key_raw; +} From 5464ee9339539aeacb0c527c9269eb81e43e92e4 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:22:53 +0000 Subject: [PATCH 06/31] ClusteringConcordance update --- R/Concordance.R | 259 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 240 insertions(+), 19 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 1ee2e8582..f42590539 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -190,15 +190,103 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, } #' @rdname SiteConcordance -#' @importFrom TreeTools Subsplit +#' @param return Character specifying what to return. +#' - `"mean"` returns the mean concordance index at each split across all sites. +#' - `"all"` returns all values calculated during the working for each site at +#' each split. +#' @param normalize Logical or numeric; if `TRUE` the mutual information will be +#' normalized such that zero corresponds to the expected mutual information of +#' a randomly drawn character with the same distribution of tokens. +#' If `FALSE`, zero will correspond to zero mutual information, +#' even if this is not possible to accomplish in practice. +#' The exact analytical solution, whilst quick, does not account for +#' non-independence between splits. This is a less important factor for larger +#' trees, and is negligible above ~200 leaves. For small trees, the expected +#' value for random trees can be estimated by resampling relabelled trees. +#' To conduct _n_ resamplings, set `normalize = n`. +#' +#' For `return = "char"`, `"tree"`, values will be normalized such that 1 +#' corresponds to the maximum possible value, and 0 to the expected value. +#' If `normalize = TRUE`, this will be the expected value for a random +#' character on the given tree. If `normalize` is numeric, the expected value +#' will be estimated by fitting the character to `n` uniformly random trees. +#' +#' @returns +#' `ClusteringConcordance(return = "all")` returns a 3D array where each +#' slice corresponds to a character (site), each column to a tree split, and +#' each row to a different information measure. The `normalized` row gives the +#' normalized mutual information between each split-character pair, scaled so +#' that 1.0 corresponds to `hBest` (the theoretical maximum mutual information, +#' being the minimum of `hSplit` and `hChar`) and 0.0 corresponds to `miRand` +#' (the expected mutual information under random association). 
`hSplit` gives +#' the entropy (information content) of each split's bipartition; `hChar` gives +#' the entropy of each character's state distribution; `hJoint` gives the joint +#' entropy of the split-character confusion matrix; `mi` gives the raw mutual +#' information; and `n` records the number of informative observations. +#' Negative normalized values indicate observed mutual information below random +#' expectation. `NA` is returned when `hBest = 0` (no information potential). +#' +#' `ClusteringConcordance(return = "edge")` returns a vector where each element +#' corresponds to a split (an edge of the tree) and gives the normalized mutual +#' information between that split and the character data, averaged across all +#' characters. +#' When `normalize = TRUE` (default), values are scaled relative to random +#' expectation; when `FALSE`, raw mutual information normalized by `hBest` is +#' returned. +#' +#' `ClusteringConcordance(return = "char")` returns a vector where each element +#' corresponds to a character (site) and gives the entropy-weighted average +#' normalized mutual information between that character and all tree splits. +#' Characters with higher information content receive proportionally more weight +#' from splits that can potentially convey more information about them. +#' +#' `ClusteringConcordance(return = "tree")` returns a single value representing +#' the overall concordance between the tree topology and the character data. +#' This averages the fit of the best-matching split for each character. +#' This is probably biased. I have not identified a circumstance in which it +#' produces meaningful results. Let me know if you find one. +#' +#' I had previously considered calculating +#' the entropy-weighted average of normalized mutual +#' information across all split-character pairs, where each pair contributes +#' proportionally to its potential information content. +#' The problem here is that imperfect matches between compatible splits +#' come to dominate, resulting in a small score that gets smaller as trees get +#' larger, even with a perfect fit. 
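As a concrete instance of the normalisation described above, the split-character pair from the package tests (tree ((a,b,c,d,e),(f,g,h)) with character 00001111) can be worked through by hand with TreeDist's exported Entropy(); a sketch, with the random-expectation term left symbolic because it comes from the internal expected_mi() routine added in this patch series:

H <- function(counts) TreeDist::Entropy(counts / sum(counts))
split <- c(5, 3)       # leaves on either side of the split
char  <- c(4, 4)       # leaves bearing each token
joint <- c(4, 1, 3)    # non-zero cells of the split-by-character table
mi    <- H(split) + H(char) - H(joint)
hBest <- min(H(split), H(char))  # theoretical maximum mutual information
mi / hBest  # then re-zeroed against miRand / hBest, the random expectation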
+#' +#' +#' @seealso +#' - [Consistency()] +#' @examples +#' data(congreveLamsdellMatrices) +#' myMatrix <- congreveLamsdellMatrices[[10]] +#' ClusteringConcordance(TreeTools::NJTree(myMatrix), myMatrix) +#' @template MRS +#' @importFrom abind abind #' @importFrom stats setNames +#' @importFrom TreeDist ClusteringEntropy Entropy entropy_int +#' MutualClusteringInfo +#' @importFrom TreeTools as.Splits MatchStrings Subsplit TipLabels #' @export -ClusteringConcordance <- function (tree, dataset) { +ClusteringConcordance <- function (tree, dataset, return = "edge", + normalize = TRUE) { + # Check inputs if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + if (is.null(tree)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + + keep <- MatchStrings(TipLabels(tree), names(dataset), warning) + if (length(keep) == 0) { + return(NULL) + } + dataset <- dataset[keep] + + # Prepare data splits <- as.logical(as.Splits(tree)) at <- attributes(dataset) @@ -208,32 +296,165 @@ ClusteringConcordance <- function (tree, dataset) { } ambiguous <- rowSums(cont) != 1 - mat <- matrix(unlist(dataset), length(dataset), byrow = TRUE) - mat[mat %in% which(ambiguous)] <- NA + mat <- matrix(as.integer(unlist(dataset)), length(dataset), byrow = TRUE) + mat[mat %in% which(ambiguous)] <- NA_integer_ + maxToken <- max(mat, na.rm = TRUE) + tokens <- as.character(seq_len(maxToken)) mat <- apply(mat, 2, function (x) { - uniques <- table(x) == 1 - x[x %in% names(uniques[uniques])] <- NA + uniques <- tabulate(x, maxToken) == 1 + x[x %in% tokens[uniques]] <- NA_integer_ x }) - - h <- apply(mat, 2, function (char) { + + # Calculate entropy + h <- simplify2array(apply(mat, 2, function(char) { aChar <- !is.na(char) ch <- char[aChar] - hChar <- .Entropy(table(ch)) - h <- apply(splits[, aChar], 1, function (spl) { - c(hSpl = .Entropy(table(spl)), hJoint = .Entropy(table(ch, spl))) + if (length(ch) == 0) { + # All ambiguous + n <- 0 + hChar <- 0 + } else { + chMax <- max(1, ch) + chTable <- tabulate(ch, chMax) + n <- length(ch) + hChar <- entropy_int(chTable) + } + + hh <- apply(splits[, aChar, drop = FALSE], 1, function (spl) { + spTable <- tabulate(spl + 1, 2) + if (any(spTable < 2)) { + c(hSplit = 0, + hJoint = hChar, + miRand = 0, + n = n) + } else { + c(hSplit = entropy_int(spTable), + hJoint = entropy_int(tabulate(ch + (spl * chMax), chMax + chMax)), + miRand = .ExpectedMI(spTable, chTable), + n = n) + } }) - cbind(hSum = hChar + h["hSpl", ], joint = h["hJoint", ]) - }) + rbind(hChar = hChar, hh) + }, simplify = FALSE)) + + if (length(dim(h)) == 2) { + # Matrix to 3D array + dim(h) <- c(dim(h), 1) + } - splitI <- seq_len(dim(splits)[1]) - both <- rowSums(h[splitI, at[["index"]], drop = FALSE]) - joint <- rowSums(h[-splitI, at[["index"]], drop = FALSE]) - mi <- both - joint + h[abs(h) < sqrt(.Machine$double.eps)] <- 0 + hh <- h[, , at[["index"]], drop = FALSE] + + hBest <- `rownames<-`(pmin(hh["hChar", , , drop = FALSE], + hh["hSplit", , , drop = FALSE]), NULL) + mi <- `rownames<-`(hh["hChar", , , drop = FALSE] + + hh["hSplit", , , drop = FALSE] - + hh["hJoint", , , drop = FALSE], NULL) + miRand <- `rownames<-`(hh["miRand", , , drop = FALSE], NULL) + norm <- if(isFALSE(normalize)) { + ifelse(hBest == 0, NA, mi / hBest) + } else { + ifelse(hBest == 0, NA, .Rezero(mi / hBest, miRand / hBest)) + } + + returnType <- pmatch(tolower(return), c("all", "edge", "character", "tree"), + nomatch = 1L) + if (returnType %in% 
3:4) { # character / tree + charSplits <- apply(mat, 2, simplify = FALSE, function(x) + as.Splits(x[!is.na(x)], tipLabels = keep[!is.na(x)])) + charMax <- vapply(charSplits, ClusteringEntropy, double(1))[ + attr(dataset, "index")] + charInfo <- MutualClusteringInfo(tree, charSplits)[at[["index"]]] + if (is.numeric(normalize)) { + rTrees <- replicate(normalize, RandomTree(tree), simplify = FALSE) + randInfo <- MutualClusteringInfo(rTrees, charSplits)[, attr(dataset, "index")] + randMean <- colMeans(randInfo) + var <- rowSums((t(randInfo) - randMean) ^ 2) / (normalize - 1) + mcse <- sqrt(var / normalize) + randTreeInfo <- rowSums(randInfo) + randTreeMean <- mean(randTreeInfo) + treeVar <- var(randTreeInfo) + mcseTree <- sqrt(treeVar / normalize) + } + } # Return: - setNames(mi / joint, rownames(splits)) + switch(returnType, + # all + abind( + along = 1, + normalized = norm, + hh, + hBest = hBest, + mi = mi, + miRand = miRand + ), { + # edge + best <- rowSums(hBest[1, , , drop = FALSE], dims = 2) + ifelse(!is.na(best) & best == 0, + NA_real_, + if (isTRUE(normalize)) { + .Rezero( + rowSums(mi[1, , , drop = FALSE], dims = 2) / best, + rowSums(miRand[1, , , drop = FALSE], dims = 2) / best + ) + } else { + rowSums(mi[1, , , drop = FALSE], dims = 2) / best + })[1, ] + }, { + # char + + # one <- hh["hChar", 1, , drop = TRUE] # All rows equal + one <- charMax + zero <- if (isFALSE(normalize)) { + 0 + } else if (isTRUE(normalize)) { + apply(hh["miRand", , ], 2, max) + } else { + randMean + } + ret <- (charInfo - zero) / (one - zero) + if (is.numeric(normalize)) { + mcseInfo <- ((one - charInfo) / (one - zero) ^ 2) * mcse + mcseInfo[mcseInfo < sqrt(.Machine$double.eps)] <- 0 + structure(ret, hMax = charMax, mcse = mcseInfo) + } else { + structure(charInfo / charMax, hMax = charMax) + } + }, { + # tree: Entropy-weighted mean across best character-split pairs + warning("I'm not aware of a situation in which this is a useful measure.") + # Random normalization doesn't work if we pick the best matching + # split; one will be randomly better than another, even in the + # absence of signal. 
+ norm[is.na(norm)] <- 0 + bestMatch <- apply(norm, 3, which.max) + idx <- cbind(1, bestMatch, seq_along(bestMatch)) + return(weighted.mean(norm[idx], hBest[idx])) + + if (isFALSE(normalize)) { + } else { + one <- sum(hh["hChar", 1, ]) + zero <- if (isTRUE(normalize)) { + sum(apply(hh["miRand", , ], 2, max)) + } else { + randTreeMean + } + ret <- (sum(charInfo) - zero) / (one - zero) + if (is.numeric(normalize)) { + mcseInfo <- ((one - sum(charInfo)) / (one - zero) ^ 2) * mcseTree + mcseInfo[mcseInfo < sqrt(.Machine$double.eps)] <- 0 + structure(ret, mcse = mcseInfo) + } else { + ret + } + } + } + ) +} + } #' @rdname SiteConcordance From 9a44f43eea2bfc1a6c271b9be83081641657e674 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:24:26 +0000 Subject: [PATCH 07/31] QA visualization & tests --- NAMESPACE | 4 + NEWS.md | 4 + R/Concordance.R | 169 +++++++++++++++++++++++++++++- tests/testthat/test-Concordance.R | 70 ++++++++++++- 4 files changed, 242 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 416d3edae..0958569b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(Carter1) export(CharacterLength) export(ClusterStrings) export(ClusteringConcordance) +export(ConcordanceTable) export(ConcordantInfo) export(ConcordantInformation) export(Consistency) @@ -68,6 +69,9 @@ export(PolEscapa) export(PrepareDataIW) export(PrepareDataProfile) export(PresCont) +export(QACol) +export(QALegend) +export(QCol) export(QuartetConcordance) export(QuartetResolution) export(RandomMorphyTree) diff --git a/NEWS.md b/NEWS.md index 85fbbf65c..8ac92f6b1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# Branch `char-concord` (development) + +- Implements the methods of Smith (forthcoming) via `ClusteringConcordance()`, + visualized with `ConcordanceTable()` and support functions `QACol()`, `QALegend()` - `QuartetConcordance()` gains `return` parameter and fast C++ implementation. diff --git a/R/Concordance.R b/R/Concordance.R index f42590539..1a616a83c 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -184,9 +184,12 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, } } -#' @importFrom TreeDist Entropy -.Entropy <- function (...) { - Entropy(c(...) / sum(...)) +#' Re-zero a value by normalization +#' @param value value ranging from zero to one +#' @param zero new value to set as zero +#' @keywords internal +.Rezero <- function(value, zero) { + (value - zero) / (1 - zero) } #' @rdname SiteConcordance @@ -455,6 +458,166 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", ) } +#' Generate colour to depict the amount and quality of observations +#' @param amount Numeric vector of values between 0 and 1, denoting the relative +#' amount of information +#' @param quality Numeric vector of values between -1 and 1, denoting the +#' quality of observations, where 0 is neutral. +#' @return `QACol()` returns an RGB hex code for a colour, where lighter colours +#' correspond to entries with a higher `amount`; unsaturated colours denote +#' a neutral `quality`; and red/cyan colours denote low/high `quality`. 
+#' @examples +#' amount <- runif(80, 0, 1) +#' quality <- runif(80, -1, 1) +#' plot(amount, quality, col = QACol(amount, quality), pch = 15) +#' abline(h = 0) +#' @template MRS +#' @importFrom colorspace hex polarLUV +#' @export +QACol <- function(amount, quality) { + h <- 80 + (quality * 140) + l <- amount * 88 # < 100: white can take no hue + c <- abs(quality) * .MaxChroma(h, l) + # Saturation higher than 1 risks overflowing the colour space + # Small overflows are caught via `fixup = TRUE`; large overflows will produce + # bright red errors + saturation <- 0.999 # Safe if max_chroma(floor = FALSE) slightly overestimates + saturation <- 1.16 + + hex(polarLUV( + H = as.numeric(h), + C = as.numeric(c) * saturation, + L = as.numeric(l) + ), fixup = TRUE) +} + +#' @importFrom colorspace max_chroma +.MaxChroma <- function(h, l) { + ret <- `length<-`(double(0), length(h)) + applicable <- !is.na(h) & !is.na(l) + ret[applicable] <- max_chroma(h[applicable], l[applicable]) + ret +} + +#' @rdname QACol +#' @return `QCol()` returns an RGB hex code for a colour, where darker, +#' unsaturated colours denote a neutral `quality`; +#' and red/cyan colours denote low/high `quality`. `amount` is ignored. +#' @export +QCol <- function(amount, quality) { + h <- 80 + (quality * 140) + l <- abs(quality) * 88 # < 100: white can take no hue + c <- abs(quality) * .MaxChroma(h, l) + # Saturation higher than 1 risks overflowing the colour space + # Small overflows are caught via `fixup = TRUE`; large overflows will produce + # bright red errors + saturation <- 0.999 # Safe if max_chroma(floor = FALSE) slightly overestimates + + hex(polarLUV( + H = as.numeric(h), + C = as.numeric(c) * saturation, + L = as.numeric(l) + ), fixup = TRUE) +} + +#' @rdname QACol +#' @param where Location of legend, passed to `par(fig = where)` +#' @param n Integer vector giving number of cells to plot in swatch for +#' `quality` and `amount`. +#' @inheritParams ConcordanceTable +#' @export +QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) { + oPar <- par(fig = where, new = TRUE, mar = rep(0, 4), xpd = NA) + on.exit(par(oPar)) + n <- rep(n, length.out = 2) + nA <- n[[2]] + nQ <- n[[1]] + amount <- seq(0, 1, length.out = nA) + quality <- seq(-1, 1, length.out = nQ) + mat <- outer(amount, quality, + Vectorize(function (a, q) Col(a, q))) + image(x = amount, y = quality, z = matrix(1:prod(n), nA, nQ), + col = mat, axes = FALSE, xlab = "", ylab = "") + mtext("Amount \U2192", side = 1, line = 1) + mtext("Quality \U2192", side = 2, line = 1) +} + +#' Plot concordance table +#' +#' `ConcordanceTable()` plots a concordance table +#' \insertCite{SmithConc}{TreeSearch}. +#' +#' @inheritParams ClusteringConcordance +#' @param Col Function that takes vectors `amount` and `quality` and returns +#' a vector of colours. [QCol] colours by data quality (concordance); +#' [QACol] by quality and amount of information. +#' @param largeClade Integer; if greater than 1, vertical lines will be drawn +#' at edges whose descendants are both contain more than `largeClade` leaves. +#' @param xlab Character giving a label for the x axis. +#' @param ylab Character giving a label for the y axis. +#' @param \dots Arguments to `abline`, to control the appearance of vertical +#' lines marking important edges. 
+#' @returns `ConcordanceTable()` invisibly returns an named list containing: +#' - `"info"`: The amount of information in each character-edge pair, in bits; +#' - `"relInfo"`: The information, normalized to the most information-rich pair; +#' - `"quality"`: The normalized mutual information of the pair; +#' - `"col"`: The colours used to plot the table. +#' +#' @references \insertAllCited{} +#' @examples +#' # Load data and tree +#' data("congreveLamsdellMatrices", package = "TreeSearch") +#' dataset <- congreveLamsdellMatrices[[1]][, 1:20] +#' tree <- referenceTree +#' +#' # Plot tree and identify nodes +#' library("TreeTools", quietly = TRUE) +#' plot(tree) +#' nodeIndex <- as.integer(rownames(as.Splits(tree))) +#' nodelabels(seq_along(nodeIndex), nodeIndex, adj = c(2, 1), +#' frame = "none", bg = NULL) +#' QALegend(where = c(0.1, 0.4, 0.1, 0.3)) +#' +#' # View information shared by characters and edges +#' ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) +#' axis(1) +#' axis(2) +#' +#' # Visualize dataset +#' image(t(`mode<-`(PhyDatToMatrix(dataset), "numeric")), axes = FALSE, +#' xlab = "Leaf", ylab = "Character") +#' @importFrom graphics abline image mtext +#' @importFrom TreeTools CladeSizes NTip +#' @export +ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, + xlab = "Edge", ylab = "Character", ...) { + cc <- ClusteringConcordance(tree, dataset, return = "all") + nodes <- seq_len(dim(cc)[[2]]) + info <- cc["hBest", , ] * cc["n", , ] + amount <- info / max(info, na.rm = TRUE) + amount[is.na(amount)] <- 0 + quality <- cc["normalized", , ] + # Plot points with incalculable quality as black, not transparent. + amount[is.na(quality)] <- 0 + quality[is.na(quality)] <- 0 + + col <- matrix(Col(amount, quality), dim(amount)[[1]], dim(amount)[[2]]) + image(nodes, seq_len(dim(cc)[[3]]), + matrix(1:prod(dim(amount)), dim(amount)[[1]]), + frame.plot = FALSE, axes = FALSE, + col = col, xlab = xlab, ylab = ylab) + + if (largeClade > 1) { + cladeSize <- CladeSizes(tree) + edge <- tree[["edge"]] + parent <- edge[, 1] + child <- edge[, 2] + bigNode <- vapply(as.integer(colnames(cc)), function (node) { + all(cladeSize[child[parent == parent[child == node]]] >= largeClade) + }, logical(1)) + abline(v = nodes[bigNode] - 0.5, ...) 
+ } + invisible(list(info = info, amount = amount, quality = quality, col = col)) } #' @rdname SiteConcordance diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index b4ffdc786..2a0470010 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -4,8 +4,10 @@ test_that("_Concordance() handles null input", { expect_warning(expect_null(QuartetConcordance(BalancedTree(8), NULL))) expect_warning(expect_null(PhylogeneticConcordance(BalancedTree(8), NULL))) expect_warning(expect_null(ClusteringConcordance(BalancedTree(8), NULL))) - expect_warning(expect_null(MutualClusteringConcordance(BalancedTree(8), NULL))) - expect_warning(expect_null(SharedPhylogeneticConcordance(BalancedTree(8), NULL))) + expect_warning(expect_null( + MutualClusteringConcordance(BalancedTree(8), NULL))) + expect_warning(expect_null( + SharedPhylogeneticConcordance(BalancedTree(8), NULL))) }) test_that("_Concordance() handles tip mismatch", { @@ -13,6 +15,8 @@ test_that("_Concordance() handles tip mismatch", { tree <- BalancedTree(5) expect_warning(expect_null(QuartetConcordance(tree, char)), "No overlap between tree labels and dataset.") + expect_warning(expect_null(ClusteringConcordance(tree, char)), + "Could not find 't1', .* in names.dataset.") }) test_that("QuartetConcordance() works", { @@ -114,6 +118,60 @@ test_that("QuartetConcordance() handles incomplete data", { expect_equal(unname(QuartetConcordance(tree, dat)), rep(NA_real_, 5)) }) +test_that(".Rezero() works", { + expect_equal(.Rezero(seq(0, 1, by = 0.1), 0.1), -1:9 / 9) +}) + +test_that("ClusteringConcordance() gives sensible values", { + tree <- BalancedTree(8) + splits <- as.Splits(tree) + # None of these characters are informative + mataset <- matrix(c(0, 0, 0, 0, 0, 0, 0, 1, + rep("?", 8)), 8, + dimnames = list(paste0("t", 1:8), NULL)) + dat <- MatrixToPhyDat(mataset) + + expect_equal(unname(ClusteringConcordance(tree, dat)), rep(NA_real_, 5)) + + tree <- ape::read.tree(text = "((a, b, c, d, e), (f, g, h));") + split <- as.Splits(tree) + + mataset <- matrix(c(0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 1, 1, # Matches split + 0, 0, 0, 0, 1, 1, 1, 1, # Consistent but not identical + 0, 0, 0, 1, 1, 1, 1, 1, # Consistent, more different + 0, 0, 0, 0, 0, 0, 1, 1, # Consistent other way + 0, 1, 0, 1, 0, 1, 0, 1, # Worst possible + 0, 0, 0, 0, rep("?", 4), # No information + 0, 0, 1, 1, rep("?", 4), # No relevant information + rep("?", 8)), 8, + dimnames = list(letters[1:8], NULL)) + dat <- MatrixToPhyDat(mataset) + cc <- ClusteringConcordance(tree, dat, return = "all")[, "10", ] + .Entropy <- function(...) { + TreeDist::Entropy(c(...) 
/ sum(...)) + } + .NormExp <- function(a, b, ab) { + .Rezero( + (.Entropy(a) + .Entropy(b) - .Entropy(ab)) / .Entropy(a), + .ExpectedMI(a, b) / .Entropy(a) + ) + } + expect_equal(cc["normalized", ], + c(NA_real_, 1, + .NormExp(c(3, 5), c(4, 4), c(1, 3, 4)), + .NormExp(c(3, 5), c(3, 5), c(2, 3, 3)), + .NormExp(c(2, 6), c(3, 5), c(2, 1, 5)), + .NormExp(c(3, 5), c(4, 4), c(2, 1, 2, 3)), + NA, NA, NA)) + + randomset <- matrix(sample(0:1, 8 * 1000, replace = TRUE), 8, + dimnames = list(letters[1:8], NULL)) + rat <- MatrixToPhyDat(randomset) + expect_equal(ClusteringConcordance(tree, rat, normalize = TRUE), c("10" = 0), + tolerance = 0.05) +}) + dataset <- congreveLamsdellMatrices[[10]][, 1] tree <- TreeTools::NJTree(dataset) @@ -148,3 +206,11 @@ test_that("ConcordantInformation() works", { log2(3) - log2(3)), ci["ignored"]) }) + +test_that("QACol() handles input", { + expect_equal(is.na(QACol(c(0, 1, NA, 0, NA), c(0, 1, NA, NA, 0))), + c(FALSE, FALSE, TRUE, + FALSE, # No data = black, quality NA by definition + TRUE)) + expect_equal(is.na(QCol(NA, c(0, 1, NA, NA))), c(FALSE, FALSE, TRUE, TRUE)) +}) From 3226a6a1600392b010d30691c89648c8cf4726fa Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:26:20 +0000 Subject: [PATCH 08/31] Update concordance functions --- R/Concordance.R | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 1a616a83c..94a154b2e 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -660,7 +660,18 @@ PhylogeneticConcordance <- function (tree, dataset) { } #' @rdname SiteConcordance -# Mutual clustering information of each split with the split implied by each character +#' @details +#' `MutualClusteringConcordance()` treats each character as a tree. +#' Each token in the character corresponds to a node in the tree whose pendant +#' edges are the taxa with that token. +#' The Mutual Clustering Concordance is then the Mutual Clustering Information +#' \insertCite{Smith2020}{TreeSearch} of the tree thus defined with `tree`. +#' +#' @return `MutualClusteringConcordance()` returns the mutual clustering +#' concordance of each character in `dataset` with `tree`. +#' The attribute `weighted.mean` gives the mean value, weighted by the +#' information content of each character. +#' @importFrom TreeTools MatchStrings #' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo #' @export MutualClusteringConcordance <- function (tree, dataset) { @@ -668,22 +679,35 @@ MutualClusteringConcordance <- function (tree, dataset) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.multiPhylo(as.Splits(tree)) characters <- as.multiPhylo(dataset) support <- rowSums(vapply(characters, function (char) { - trimmed <- lapply(splits, keep.tip, TipLabels(char)) + trimmed <- KeepTip(splits, TipLabels(char)) cbind(mi = MutualClusteringInfo(char, trimmed), possible = ClusteringEntropy(trimmed)) }, matrix(NA_real_, length(splits), 2)), dims = 2) + ret <- support[, 1] / support[, 2] # Return: - support[, 1] / support[, 2] + structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) } #' @rdname SiteConcordance -#' @importFrom TreeTools as.multiPhylo +#' @details +#' `SharedPhylogeneticConcordance()` treats each character as a simple tree. 
+#' Each token in the character corresponds to a node whose pendant edges are the +#' taxa with that token. +#' The Shared Phylogenetic Concordance for each character in `dataset` is then +#' the Shared Phylogenetic Information \insertCite{Smith2020}{TreeSearch} of +#' this tree and `tree`. +#' @return `SharedPhylogeneticConcordance()` returns the shared phylogenetic +#' concordance of each character in `dataset` with `tree`. +#' The attribute `weighted.mean` gives the mean value, weighted by the +#' information content of each character. +#' @importFrom TreeTools as.multiPhylo MatchStrings #' @importFrom TreeDist ClusteringInfo SharedPhylogeneticInfo #' @export SharedPhylogeneticConcordance <- function (tree, dataset) { @@ -691,7 +715,7 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.multiPhylo(as.Splits(tree)) characters <- as.multiPhylo(dataset) @@ -701,8 +725,9 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { possible = ClusteringInfo(trimmed)) }, matrix(NA_real_, length(splits), 2)), dims = 2) + ret <- support[, 1] / support[, 2] # Return: - support[, 1] / support[, 2] + structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) } #' Evaluate the concordance of information between a tree and a dataset @@ -737,10 +762,10 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { #' myMatrix <- congreveLamsdellMatrices[[10]] #' ConcordantInformation(TreeTools::NJTree(myMatrix), myMatrix) #' @template MRS -#' @importFrom TreeTools Log2UnrootedMult Log2Unrooted +#' @importFrom TreeTools Log2UnrootedMult Log2Unrooted MatchStrings #' @export ConcordantInformation <- function (tree, dataset) { - dataset <- dataset[TipLabels(tree)] + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] originalInfo <- sum(apply(PhyDatToMatrix(dataset), 2, CharacterInformation)) dataset <- PrepareDataProfile(dataset) From c29e8d321de91f6406c5ba5a0487466c343788ab Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:27:22 +0000 Subject: [PATCH 09/31] Recall MaximumLength --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 8ac92f6b1..fbbacc43b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,6 +21,7 @@ - Support single-character matrices in `ClusteringConcordance()` - Fix `DoNothing(x)` to return `x` (not `NULL`) - Remove unused `delete_rawdata()` due to implementation issues. +- Port `MaximumLength()` to C++ to handle more characters, more efficiently. 
# TreeSearch 1.6.1 (2025-06-10) From e4284fa24edf7f8c2b40c3ae57c59c1a8b2c2ad9 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Tue, 13 Jan 2026 16:27:47 +0000 Subject: [PATCH 10/31] document() --- NAMESPACE | 10 +++++ man/ConcordanceTable.Rd | 78 +++++++++++++++++++++++++++++++++ man/Consistency.Rd | 10 ++--- man/ExpectedLength.Rd | 4 +- man/QACol.Rd | 52 ++++++++++++++++++++++ man/SiteConcordance.Rd | 95 ++++++++++++++++++++++++++++++++++++++++- man/dot-Rezero.Rd | 17 ++++++++ 7 files changed, 257 insertions(+), 9 deletions(-) create mode 100644 man/ConcordanceTable.Rd create mode 100644 man/QACol.Rd create mode 100644 man/dot-Rezero.Rd diff --git a/NAMESPACE b/NAMESPACE index 0958569b9..e542e3959 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -144,9 +144,11 @@ importFrom(TreeDist,ClusteringInfoDistance) importFrom(TreeDist,Entropy) importFrom(TreeDist,MutualClusteringInfo) importFrom(TreeDist,SharedPhylogeneticInfo) +importFrom(TreeDist,entropy_int) importFrom(TreeTools,AddTipEverywhere) importFrom(TreeTools,AddUnconstrained) importFrom(TreeTools,CharacterInformation) +importFrom(TreeTools,CladeSizes) importFrom(TreeTools,CladisticInfo) importFrom(TreeTools,CompatibleSplits) importFrom(TreeTools,Consensus) @@ -194,6 +196,7 @@ importFrom(TreeTools,TipLabels) importFrom(TreeTools,TreeIsRooted) importFrom(TreeTools,as.Splits) importFrom(TreeTools,as.multiPhylo) +importFrom(abind,abind) importFrom(ape,consensus) importFrom(ape,keep.tip) importFrom(ape,nodelabels) @@ -201,6 +204,7 @@ importFrom(ape,plot.phylo) importFrom(ape,read.nexus) importFrom(ape,root) importFrom(ape,write.nexus) +importFrom(base64enc,base64encode) importFrom(cli,cli_alert) importFrom(cli,cli_alert_danger) importFrom(cli,cli_alert_info) @@ -212,11 +216,17 @@ importFrom(cli,cli_progress_done) importFrom(cli,cli_progress_update) importFrom(cluster,pam) importFrom(cluster,silhouette) +importFrom(colorspace,hex) +importFrom(colorspace,max_chroma) +importFrom(colorspace,polarLUV) importFrom(fastmap,fastmap) importFrom(fastmatch,"%fin%") importFrom(fastmatch,fmatch) importFrom(fs,path_sanitize) importFrom(future,future) +importFrom(graphics,abline) +importFrom(graphics,image) +importFrom(graphics,mtext) importFrom(graphics,par) importFrom(promises,future_promise) importFrom(protoclust,protoclust) diff --git a/man/ConcordanceTable.Rd b/man/ConcordanceTable.Rd new file mode 100644 index 000000000..5239f1bcb --- /dev/null +++ b/man/ConcordanceTable.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{ConcordanceTable} +\alias{ConcordanceTable} +\title{Plot concordance table} +\usage{ +ConcordanceTable( + tree, + dataset, + Col = QACol, + largeClade = 0, + xlab = "Edge", + ylab = "Character", + ... +) +} +\arguments{ +\item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} + +\item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class +\code{phyDat}, whose names correspond to the labels of any accompanying tree. +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. +Additive (ordered) characters can be handled using +\code{\link[TreeTools]{Decompose}()}.} + +\item{Col}{Function that takes vectors \code{amount} and \code{quality} and returns +a vector of colours. 
\link{QCol} colours by data quality (concordance); +\link{QACol} by quality and amount of information.} + +\item{largeClade}{Integer; if greater than 1, vertical lines will be drawn +at edges whose descendants are both contain more than \code{largeClade} leaves.} + +\item{xlab}{Character giving a label for the x axis.} + +\item{ylab}{Character giving a label for the y axis.} + +\item{\dots}{Arguments to \code{abline}, to control the appearance of vertical +lines marking important edges.} +} +\value{ +\code{ConcordanceTable()} invisibly returns an named list containing: +\itemize{ +\item \code{"info"}: The amount of information in each character-edge pair, in bits; +\item \code{"relInfo"}: The information, normalized to the most information-rich pair; +\item \code{"quality"}: The normalized mutual information of the pair; +\item \code{"col"}: The colours used to plot the table. +} +} +\description{ +\code{ConcordanceTable()} plots a concordance table +\insertCite{SmithConc}{TreeSearch}. +} +\examples{ +# Load data and tree +data("congreveLamsdellMatrices", package = "TreeSearch") +dataset <- congreveLamsdellMatrices[[1]][, 1:20] +tree <- referenceTree + +# Plot tree and identify nodes +library("TreeTools", quietly = TRUE) +plot(tree) +nodeIndex <- as.integer(rownames(as.Splits(tree))) +nodelabels(seq_along(nodeIndex), nodeIndex, adj = c(2, 1), + frame = "none", bg = NULL) +QALegend(where = c(0.1, 0.4, 0.1, 0.3)) + +# View information shared by characters and edges +ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) +axis(1) +axis(2) + +# Visualize dataset +image(t(`mode<-`(PhyDatToMatrix(dataset), "numeric")), axes = FALSE, + xlab = "Leaf", ylab = "Character") +} +\references{ +\insertAllCited{} +} diff --git a/man/Consistency.Rd b/man/Consistency.Rd index f0fb87a77..73ff05282 100644 --- a/man/Consistency.Rd +++ b/man/Consistency.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/Consistency.R \name{Consistency} \alias{Consistency} -\title{Consistency / retention "indices"} +\title{Consistency and retention "indices"} \usage{ Consistency(dataset, tree, nRelabel = 0, compress = FALSE) } @@ -16,8 +16,8 @@ Additive (ordered) characters can be handled using \item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} \item{nRelabel}{Integer specifying how many times to relabel leaves when -estimating null tree length for \acronym{RHI} calculation. -\insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +computing MCMC estimate of null tree length for \acronym{RHI} calculation. +\insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that 100 may suffice. If zero (the default), the \acronym{RHI} is not calculated.} @@ -53,7 +53,7 @@ the tree is optimal. Note that as the possible values of the consistency index do not range from zero to one, it is not an index in the mathematical sense of the term. Shortcomings of this measure are widely documented -\insertCite{Archie1989,Brooks1986,Steell2023}{TreeSearch}. +\insertCite{Archie1989,Brooks1986,Steell2025}{TreeSearch}. The maximum length of a character (see \code{\link[=MaximumLength]{MaximumLength()}}) is the number of steps in a parsimonious reconstruction on the longest possible tree @@ -69,7 +69,7 @@ retention indices; it rescales the consistency index such that its range of possible values runs from zero (least consistent) to one (perfectly consistent). 
-The \strong{relative homoplasy index} \insertCite{Steell2023}{TreeSearch} is +The \strong{relative homoplasy index} \insertCite{Steell2025}{TreeSearch} is the ratio of the observed excess tree length to the excess tree length due to chance, taken as the median score of a character when the leaves of the given tree are randomly shuffled. diff --git a/man/ExpectedLength.Rd b/man/ExpectedLength.Rd index 1b6fc4a6b..5c7cc3659 100644 --- a/man/ExpectedLength.Rd +++ b/man/ExpectedLength.Rd @@ -16,8 +16,8 @@ Additive (ordered) characters can be handled using \item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} \item{nRelabel}{Integer specifying how many times to relabel leaves when -estimating null tree length for \acronym{RHI} calculation. -\insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +computing MCMC estimate of null tree length for \acronym{RHI} calculation. +\insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that 100 may suffice. If zero (the default), the \acronym{RHI} is not calculated.} diff --git a/man/QACol.Rd b/man/QACol.Rd new file mode 100644 index 000000000..bb686a870 --- /dev/null +++ b/man/QACol.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{QACol} +\alias{QACol} +\alias{QCol} +\alias{QALegend} +\title{Generate colour to depict the amount and quality of observations} +\usage{ +QACol(amount, quality) + +QCol(amount, quality) + +QALegend(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) +} +\arguments{ +\item{amount}{Numeric vector of values between 0 and 1, denoting the relative +amount of information} + +\item{quality}{Numeric vector of values between -1 and 1, denoting the +quality of observations, where 0 is neutral.} + +\item{where}{Location of legend, passed to \code{par(fig = where)}} + +\item{n}{Integer vector giving number of cells to plot in swatch for +\code{quality} and \code{amount}.} + +\item{Col}{Function that takes vectors \code{amount} and \code{quality} and returns +a vector of colours. \link{QCol} colours by data quality (concordance); +\link{QACol} by quality and amount of information.} +} +\value{ +\code{QACol()} returns an RGB hex code for a colour, where lighter colours +correspond to entries with a higher \code{amount}; unsaturated colours denote +a neutral \code{quality}; and red/cyan colours denote low/high \code{quality}. + +\code{QCol()} returns an RGB hex code for a colour, where darker, +unsaturated colours denote a neutral \code{quality}; +and red/cyan colours denote low/high \code{quality}. \code{amount} is ignored. +} +\description{ +Generate colour to depict the amount and quality of observations +} +\examples{ +amount <- runif(80, 0, 1) +quality <- runif(80, -1, 1) +plot(amount, quality, col = QACol(amount, quality), pch = 15) +abline(h = 0) +} +\author{ +\href{https://smithlabdurham.github.io/}{Martin R. 
Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 72fb08f13..50e123d2a 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -11,7 +11,7 @@ \usage{ QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") -ClusteringConcordance(tree, dataset) +ClusteringConcordance(tree, dataset, return = "edge", normalize = TRUE) PhylogeneticConcordance(tree, dataset) @@ -30,6 +30,83 @@ Additive (ordered) characters can be handled using \item{weight}{Logical specifying whether to weight sites according to the number of quartets they are decisive for.} + +\item{return}{Character specifying what to return. +\itemize{ +\item \code{"mean"} returns the mean concordance index at each split across all sites. +\item \code{"all"} returns all values calculated during the working for each site at +each split. +}} + +\item{normalize}{Logical or numeric; if \code{TRUE} the mutual information will be +normalized such that zero corresponds to the expected mutual information of +a randomly drawn character with the same distribution of tokens. +If \code{FALSE}, zero will correspond to zero mutual information, +even if this is not possible to accomplish in practice. +The exact analytical solution, whilst quick, does not account for +non-independence between splits. This is a less important factor for larger +trees, and is negligible above ~200 leaves. For small trees, the expected +value for random trees can be estimated by resampling relabelled trees. +To conduct \emph{n} resamplings, set \code{normalize = n}. + +For \code{return = "char"}, \code{"tree"}, values will be normalized such that 1 +corresponds to the maximum possible value, and 0 to the expected value. +If \code{normalize = TRUE}, this will be the expected value for a random +character on the given tree. If \code{normalize} is numeric, the expected value +will be estimated by fitting the character to \code{n} uniformly random trees.} +} +\value{ +\code{ClusteringConcordance(return = "all")} returns a 3D array where each +slice corresponds to a character (site), each column to a tree split, and +each row to a different information measure. The \code{normalized} row gives the +normalized mutual information between each split-character pair, scaled so +that 1.0 corresponds to \code{hBest} (the theoretical maximum mutual information, +being the minimum of \code{hSplit} and \code{hChar}) and 0.0 corresponds to \code{miRand} +(the expected mutual information under random association). \code{hSplit} gives +the entropy (information content) of each split's bipartition; \code{hChar} gives +the entropy of each character's state distribution; \code{hJoint} gives the joint +entropy of the split-character confusion matrix; \code{mi} gives the raw mutual +information; and \code{n} records the number of informative observations. +Negative normalized values indicate observed mutual information below random +expectation. \code{NA} is returned when \code{hBest = 0} (no information potential). + +\code{ClusteringConcordance(return = "edge")} returns a vector where each element +corresponds to a split (an edge of the tree) and gives the normalized mutual +information between that split and the character data, averaged across all +characters. +When \code{normalize = TRUE} (default), values are scaled relative to random +expectation; when \code{FALSE}, raw mutual information normalized by \code{hBest} is +returned. 
+ +\code{ClusteringConcordance(return = "char")} returns a vector where each element +corresponds to a character (site) and gives the entropy-weighted average +normalized mutual information between that character and all tree splits. +Characters with higher information content receive proportionally more weight +from splits that can potentially convey more information about them. + +\code{ClusteringConcordance(return = "tree")} returns a single value representing +the overall concordance between the tree topology and the character data. +This averages the fit of the best-matching split for each character. +This is probably biased. I have not identified a circumstance in which it +produces meaningful results. Let me know if you find one. + +I had previously considered calculating +the entropy-weighted average of normalized mutual +information across all split-character pairs, where each pair contributes +proportionally to its potential information content. +The problem here is that imperfect matches between compatible splits +come to dominate, resulting in a small score that gets smaller as trees get +larger, even with a perfect fit. + +\code{MutualClusteringConcordance()} returns the mutual clustering +concordance of each character in \code{dataset} with \code{tree}. +The attribute \code{weighted.mean} gives the mean value, weighted by the +information content of each character. + +\code{SharedPhylogeneticConcordance()} returns the shared phylogenetic +concordance of each character in \code{dataset} with \code{tree}. +The attribute \code{weighted.mean} gives the mean value, weighted by the +information content of each character. } \description{ The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure @@ -66,6 +143,13 @@ but this has not yet been tested. \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. Complete documentation and discussion will follow in due course. + +\code{MutualClusteringConcordance()} treats each character as a tree. +Each token in the character corresponds to a node in the tree whose pendant +edges are the taxa with that token. +The Mutual Clustering Concordance is then the Mutual Clustering Information +\insertCite{Smith2020}{TreeSearch} of the tree thus defined with \code{tree}. + \code{SharedPhylogeneticConcordance()} treats each character as a simple tree. Each token in the character corresponds to a node whose pendant edges are the taxa with that token. @@ -76,7 +160,7 @@ this tree and \code{tree}. 
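A short sketch of the return options documented above, assuming the bundled congreveLamsdellMatrices data; the indexing of the 3D array follows the slice (character) x column (split) x row (measure) layout described for return = "all".

library("TreeSearch")
data("congreveLamsdellMatrices", package = "TreeSearch")
dataset <- congreveLamsdellMatrices[[1]][, 1:20]
tree <- referenceTree
ClusteringConcordance(tree, dataset)                   # per split ("edge", the default)
ClusteringConcordance(tree, dataset, return = "char")  # per character
cc3d <- ClusteringConcordance(tree, dataset, return = "all")
dim(cc3d)    # measures x splits x characters
cc3d[, , 1]  # every measure, for each split, against the first character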
\examples{ data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[1]][, 1:20] -tree <- referenceTree +tree <- TreeSearch::referenceTree qc <- QuartetConcordance(tree, dataset) cc <- ClusteringConcordance(tree, dataset) pc <- PhylogeneticConcordance(tree, dataset) @@ -102,11 +186,18 @@ ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") # Display correlation between concordance factors pairs(cbind(qc, cc, pc, spc, mcc), asp = 1) +data(congreveLamsdellMatrices) +myMatrix <- congreveLamsdellMatrices[[10]] +ClusteringConcordance(TreeTools::NJTree(myMatrix), myMatrix) } \references{ \insertAllCited{} } \seealso{ +\itemize{ +\item \code{\link[=Consistency]{Consistency()}} +} + Other split support functions: \code{\link{JackLabels}()}, \code{\link{Jackknife}()}, diff --git a/man/dot-Rezero.Rd b/man/dot-Rezero.Rd new file mode 100644 index 000000000..9fb869b79 --- /dev/null +++ b/man/dot-Rezero.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{.Rezero} +\alias{.Rezero} +\title{Re-zero a value by normalization} +\usage{ +.Rezero(value, zero) +} +\arguments{ +\item{value}{value ranging from zero to one} + +\item{zero}{new value to set as zero} +} +\description{ +Re-zero a value by normalization +} +\keyword{internal} From c459db371366583465519cba8d2b9784fd20337d Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 14 Jan 2026 11:32:41 +0000 Subject: [PATCH 11/31] abind --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 911f634f8..b43588160 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,7 +33,8 @@ URL: https://ms609.github.io/TreeSearch/ (doc), https://github.com/ms609/TreeSearch/ (devel) BugReports: https://github.com/ms609/TreeSearch/issues/ Depends: R (>= 4.0) -Imports: +Imports: + abind, ape (>= 5.6), cli (>= 3.0), cluster, From ae0f7a28814dfac092eb31b84c7c25a7f6cbac7a Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 14 Jan 2026 12:54:14 +0000 Subject: [PATCH 12/31] base64enc, colorspace --- DESCRIPTION | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index b43588160..7e9d2b56f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,8 +36,10 @@ Depends: R (>= 4.0) Imports: abind, ape (>= 5.6), + base64enc, cli (>= 3.0), cluster, + colorspace, fastmap, fastmatch (>= 1.1.3), fs, From bfc3dd41c10a987016210457090e556d447cea8b Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 08:57:56 +0000 Subject: [PATCH 13/31] Placeholder SmithConc ref --- inst/REFERENCES.bib | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 52b216f40..b26a7a984 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -509,6 +509,13 @@ @article{Smith2020 doi = {10.1093/bioinformatics/btaa614} } +@article{SmithConc, + title = {Which characters support which clades? 
Exploring the distribution of phylogenetic signal using concordant information}, + author = {Smith, Martin R.}, + journal = {Forthcoming}, + year = {2026} +} + @article{SmithCons, title = {Using information theory to detect rogue taxa and improve consensus trees}, author = {Smith, Martin R.}, From b63ca5d267016f9033c2f2fe469f8bc0935acc9f Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 09:43:31 +0000 Subject: [PATCH 14/31] Fix ambig handling in `QuartetConcordance()` --- R/Concordance.R | 15 +++++++-------- man/SiteConcordance.Rd | 4 ++-- tests/testthat/test-Concordance.R | 1 + 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 94a154b2e..c0f91baec 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -27,9 +27,8 @@ #' other implementations (e.g. IQ-TREE) follow #' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample #' of quartets for a faster, if potentially less accurate, computation. -#TODO -#' `QuartetConcordance()` in principle supports ambiguous character states, -#' but this has not yet been tested. +#' Ambiguous and inapplicable tokens are treated as containing no grouping information +#' (i.e. `(02)` or `-` are each treated as `?`). #' # `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report # the proportion of clustering information and phylogenetic information @@ -114,12 +113,12 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) charLevels <- attr(dataset, "allLevels") - charLevels[rowSums(attr(dataset, "contrast")) > 1] <- NA - charLevels <- setdiff(charLevels, "-") + isAmbig <- rowSums(attr(dataset, "contrast")) > 1 + isInapp <- charLevels == "-" + nonGroupingLevels <- charLevels[isAmbig | isInapp] + characters[characters %in% nonGroupingLevels] <- NA - charInt <- matrix(match(characters, charLevels), - nrow = nrow(characters), - dimnames = dimnames(characters)) + charInt <- `mode<-`(characters, "integer") raw_counts <- quartet_concordance(logiSplits, charInt) num <- raw_counts$concordant diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 50e123d2a..a0ac9b177 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -137,8 +137,8 @@ If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample of quartets for a faster, if potentially less accurate, computation. -\code{QuartetConcordance()} in principle supports ambiguous character states, -but this has not yet been tested. +Ambiguous and inapplicable tokens are treated as containing no grouping information +(i.e. \code{(02)} or \code{-} are each treated as \verb{?}). \strong{NOTE:} These functions are under development. They are incompletely tested, and may change without notice. 
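A small sketch of the ambiguity handling described above: replacing a "?" with "{01}" or "-" should leave the quartet concordance unchanged, as none of these tokens groups taxa. The toy matrix below is illustrative only.

library("TreeTools", quietly = TRUE)
library("TreeSearch")
tree <- BalancedTree(8)
plain <- matrix(c(0, 0, 0, 0, 1, 1, 1, 1,
                  0, 0, "?", 0, 1, 1, "?", 1), ncol = 2,
                dimnames = list(TipLabels(tree), NULL))
ambig <- plain
ambig[3, 2] <- "{01}"  # ambiguous token
ambig[7, 2] <- "-"     # inapplicable token
QuartetConcordance(tree, MatrixToPhyDat(plain))
QuartetConcordance(tree, MatrixToPhyDat(ambig))  # expected to equal the line above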
diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 2a0470010..48e2d579b 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -80,6 +80,7 @@ test_that("QuartetConcordance() works", { }) test_that("QuartetConcordance() handles ambiguity", { + library("TreeTools", quietly = TRUE) tree <- BalancedTree(12) splits <- as.Splits(tree) mataset <- matrix(c(0, 0, "{01}", 0, 0, "{01}", 1, 1, "-", 1, 1, "-", From 95b4b2410e2a24b1f627a9e78f8a8662e5770d6c Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 09:57:14 +0000 Subject: [PATCH 15/31] Clarify and avoid "splling error" --- R/Concordance.R | 13 +++++++------ man/SiteConcordance.Rd | 13 +++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index c0f91baec..0916666d4 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -200,12 +200,13 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, #' normalized such that zero corresponds to the expected mutual information of #' a randomly drawn character with the same distribution of tokens. #' If `FALSE`, zero will correspond to zero mutual information, -#' even if this is not possible to accomplish in practice. -#' The exact analytical solution, whilst quick, does not account for -#' non-independence between splits. This is a less important factor for larger -#' trees, and is negligible above ~200 leaves. For small trees, the expected -#' value for random trees can be estimated by resampling relabelled trees. -#' To conduct _n_ resamplings, set `normalize = n`. +#' even if this is not achievable in practice. +#' The exact analytical solution, though fast, does not account for +#' non-independence between splits. This limitation is minor for larger +#' trees, and becomes negligible for trees with more than ~200 leaves. +#' For smaller trees, the expected value for random trees can be approximated +#' by resampling relabelled trees. Setting `normalize = n` will approximate the +#' expected value based on _n_ samples. #' #' For `return = "char"`, `"tree"`, values will be normalized such that 1 #' corresponds to the maximum possible value, and 0 to the expected value. diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index a0ac9b177..75dbdbff1 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -42,12 +42,13 @@ each split. normalized such that zero corresponds to the expected mutual information of a randomly drawn character with the same distribution of tokens. If \code{FALSE}, zero will correspond to zero mutual information, -even if this is not possible to accomplish in practice. -The exact analytical solution, whilst quick, does not account for -non-independence between splits. This is a less important factor for larger -trees, and is negligible above ~200 leaves. For small trees, the expected -value for random trees can be estimated by resampling relabelled trees. -To conduct \emph{n} resamplings, set \code{normalize = n}. +even if this is not achievable in practice. +The exact analytical solution, though fast, does not account for +non-independence between splits. This limitation is minor for larger +trees, and becomes negligible for trees with more than ~200 leaves. +For smaller trees, the expected value for random trees can be approximated +by resampling relabelled trees. Setting \code{normalize = n} will approximate the +expected value based on \emph{n} samples. 
For \code{return = "char"}, \code{"tree"}, values will be normalized such that 1 corresponds to the maximum possible value, and 0 to the expected value. From 4e49f6885aeabbe185704e8f6af6151b434f230b Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:14:38 +0000 Subject: [PATCH 16/31] rm misplaced lines --- tests/testthat/test-Concordance.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index 48e2d579b..6f46b05eb 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -173,12 +173,6 @@ test_that("ClusteringConcordance() gives sensible values", { tolerance = 0.05) }) -dataset <- congreveLamsdellMatrices[[10]][, 1] -tree <- TreeTools::NJTree(dataset) - -ConcordantInformation(tree, dataset)["noise"] -TreeLength(tree, dataset, concavity = "prof") - test_that("ConcordantInformation() works", { data(congreveLamsdellMatrices) dat <- congreveLamsdellMatrices[[10]] From fd1192e9674338874a00f7f908bcb17ddb9acca6 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:31:32 +0000 Subject: [PATCH 17/31] Optimize quart_conc --- DESCRIPTION | 2 +- benchmark/bench-quartet-conc.R | 21 ++++++ src/quartet_concordance.cpp | 128 ++++++++++++++------------------- 3 files changed, 77 insertions(+), 74 deletions(-) create mode 100644 benchmark/bench-quartet-conc.R diff --git a/DESCRIPTION b/DESCRIPTION index 7e9d2b56f..992de6f1b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.7.0 +Version: 1.7.0.9001 Authors@R: c( person( "Martin R.", 'Smith', diff --git a/benchmark/bench-quartet-conc.R b/benchmark/bench-quartet-conc.R new file mode 100644 index 000000000..510ed140b --- /dev/null +++ b/benchmark/bench-quartet-conc.R @@ -0,0 +1,21 @@ +library("TreeTools") +library("TreeSearch") +data("congreveLamsdellMatrices", package = "TreeSearch") +dataset <- congreveLamsdellMatrices[[42]] +someNA <- PhyDatToMatrix(dataset) +someNA[sample.int(length(someNA), 444)] <- "?" +someNA <- MatrixToPhyDat(someNA) +tree <- TreeSearch::referenceTree + +bench::mark( + QuartetConcordance(tree, dataset), + QuartetConcordance(tree, someNA), check = FALSE +) + +# +# A tibble: 2 × 13 + # expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory + # +# 1 QuartetConcorda… 237ms 239ms 4.19 2.01MB 2.09 2 1 478ms +# 2 QuartetConcorda… 233ms 236ms 4.23 1.88MB 2.12 2 1 472ms +# ℹ 2 more variables: time , gc diff --git a/src/quartet_concordance.cpp b/src/quartet_concordance.cpp index 32be65b79..463140874 100644 --- a/src/quartet_concordance.cpp +++ b/src/quartet_concordance.cpp @@ -1,103 +1,85 @@ #include -#include #include using namespace Rcpp; // [[Rcpp::export]] List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix characters) { - int n_splits = splits.ncol(); - int n_chars = characters.ncol(); - int n_taxa = splits.nrow(); + const int n_splits = splits.ncol(); + const int n_chars = characters.ncol(); + const int n_taxa = splits.nrow(); - // Storage for results - // Using NumericMatrix to safely hold potentially large counts (double precision) NumericMatrix concordant(n_splits, n_chars); NumericMatrix decisive(n_splits, n_chars); - - // Loop over every Split (s) and every Character (c) - for (int s = 0; s < n_splits; ++s) { - for (int c = 0; c < n_chars; ++c) { - - // 1. 
Build Contingency Table manually - // Map: Character State (int) -> pair - // We use a map to handle arbitrary state integers (0, 1, 2, or 10, 20...). - std::map> counts; - + + // Pre-allocate vectors to avoid repeated allocations in the inner loop + // We'll use these to store the counts for each state. + // 32 is usually plenty for states; if you expect more, use a larger constant. + std::vector n0(32), n1(32); + std::vector active_states; + active_states.reserve(32); + + for (int c = 0; c < n_chars; ++c) { + std::vector char_col(n_taxa); + for (int t = 0; t < n_taxa; ++t) char_col[t] = characters(t, c); + + for (int s = 0; s < n_splits; ++s) { + active_states.clear(); + for (int t = 0; t < n_taxa; ++t) { - int char_state = characters(t, c); + int state = char_col[t]; + if (IntegerVector::is_na(state)) continue; - // Skip NAs (ambiguous characters) - if (IntegerVector::is_na(char_state)) continue; + if (state >= (int)n0.size()) { + n0.resize(state + 1, 0); + n1.resize(state + 1, 0); + } - // Check split membership (FALSE=0, TRUE=1) + if (n0[state] == 0 && n1[state] == 0) { + active_states.push_back(state); + } + if (splits(t, s)) { - counts[char_state].second++; + n1[state]++; } else { - counts[char_state].first++; + n0[state]++; } } - - // If fewer than 2 states are present, no quartets can be decisive - if (counts.size() < 2) { - concordant(s, c) = 0; - decisive(s, c) = 0; - continue; - } - - // 2. Calculate Combinatorics - // We need to sum over combinations without O(N^2) loops. - - // Variables for Concordance: sum( choose(n0i, 2) * sum(choose(n1j, 2)) ) where j != i - // This is equivalent to: sum( choose(n0i, 2) * (Total_Choose_N1 - choose(n1i, 2)) ) + + // 2. Combinatorics logic double total_choose_n1 = 0; - std::vector choose_n0_vec; - std::vector choose_n1_vec; - choose_n0_vec.reserve(counts.size()); - choose_n1_vec.reserve(counts.size()); - - // Variables for Discordance: sum( n0i*n1i * n0j*n1j ) for i < j - // This is equivalent to: 0.5 * ( (sum P)^2 - sum(P^2) ) where P_k = n0k * n1k double sum_prod = 0; double sum_sq_prod = 0; - - for (auto const& [state, cnt] : counts) { - double n0 = cnt.first; - double n1 = cnt.second; - - // Concordance Pre-calcs - double cn0 = n0 * (n0 - 1) / 2.0; - double cn1 = n1 * (n1 - 1) / 2.0; - - choose_n0_vec.push_back(cn0); - choose_n1_vec.push_back(cn1); - total_choose_n1 += cn1; + + for (int state : active_states) { + double cnt0 = n0[state]; + double cnt1 = n1[state]; - // Discordance Pre-calcs - double p = n0 * n1; + total_choose_n1 += (cnt1 * (cnt1 - 1.0) / 2.0); + double p = cnt0 * cnt1; sum_prod += p; sum_sq_prod += p * p; } - - // Finalize Concordance + double conc_val = 0; - for (size_t k = 0; k < choose_n0_vec.size(); ++k) { - // Pairs of (0,0) from state K and (1,1) from any OTHER state - conc_val += choose_n0_vec[k] * (total_choose_n1 - choose_n1_vec[k]); + for (int state : active_states) { + double cnt0 = n0[state]; + double cnt1 = n1[state]; + double cn0 = cnt0 * (cnt0 - 1.0) / 2.0; + double cn1 = cnt1 * (cnt1 - 1.0) / 2.0; + conc_val += cn0 * (total_choose_n1 - cn1); } - - // Finalize Discordance - double disc_val = 0.5 * (sum_prod * sum_prod - sum_sq_prod); - - // Store results + concordant(s, c) = conc_val; - // Decisive = Concordant + Discordant - decisive(s, c) = conc_val + disc_val; + decisive(s, c) = conc_val + (0.5 * (sum_prod * sum_prod - sum_sq_prod)); + + // Cleanup using the tracked active states + for (int state : active_states) { + n0[state] = 0; + n1[state] = 0; + } } } - - return List::create( - _["concordant"] = 
concordant, - _["decisive"] = decisive - ); + + return List::create(_["concordant"] = concordant, _["decisive"] = decisive); } From c021c5f82eff469d298db7eaf349314a6a7921ef Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:37:10 +0000 Subject: [PATCH 18/31] Update quartet_concordance.cpp --- src/quartet_concordance.cpp | 53 +++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/quartet_concordance.cpp b/src/quartet_concordance.cpp index 463140874..dd3cbb495 100644 --- a/src/quartet_concordance.cpp +++ b/src/quartet_concordance.cpp @@ -12,14 +12,13 @@ List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix charact NumericMatrix concordant(n_splits, n_chars); NumericMatrix decisive(n_splits, n_chars); - // Pre-allocate vectors to avoid repeated allocations in the inner loop - // We'll use these to store the counts for each state. - // 32 is usually plenty for states; if you expect more, use a larger constant. - std::vector n0(32), n1(32); + // Pre-allocate buffers + std::vector n0(32, 0), n1(32, 0); std::vector active_states; active_states.reserve(32); for (int c = 0; c < n_chars; ++c) { + // Cache character column for memory locality std::vector char_col(n_taxa); for (int t = 0; t < n_taxa; ++t) char_col[t] = characters(t, c); @@ -46,34 +45,32 @@ List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix charact } } - // 2. Combinatorics logic - double total_choose_n1 = 0; - double sum_prod = 0; - double sum_sq_prod = 0; + if (active_states.size() >= 2) { + double total_choose_n1 = 0; + double sum_prod = 0; + double sum_sq_prod = 0; - for (int state : active_states) { - double cnt0 = n0[state]; - double cnt1 = n1[state]; - - total_choose_n1 += (cnt1 * (cnt1 - 1.0) / 2.0); - double p = cnt0 * cnt1; - sum_prod += p; - sum_sq_prod += p * p; - } + for (int state : active_states) { + double cnt1 = n1[state]; + total_choose_n1 += (cnt1 * (cnt1 - 1.0) / 2.0); + + double p = (double)n0[state] * cnt1; + sum_prod += p; + sum_sq_prod += p * p; + } - double conc_val = 0; - for (int state : active_states) { - double cnt0 = n0[state]; - double cnt1 = n1[state]; - double cn0 = cnt0 * (cnt0 - 1.0) / 2.0; - double cn1 = cnt1 * (cnt1 - 1.0) / 2.0; - conc_val += cn0 * (total_choose_n1 - cn1); + double conc_val = 0; + for (int state : active_states) { + double cn0 = (double)n0[state] * (n0[state] - 1.0) / 2.0; + double cn1 = (double)n1[state] * (n1[state] - 1.0) / 2.0; + conc_val += cn0 * (total_choose_n1 - cn1); + } + + concordant(s, c) = conc_val; + decisive(s, c) = conc_val + (0.5 * (sum_prod * sum_prod - sum_sq_prod)); } - concordant(s, c) = conc_val; - decisive(s, c) = conc_val + (0.5 * (sum_prod * sum_prod - sum_sq_prod)); - - // Cleanup using the tracked active states + // Reset buffers for next split for (int state : active_states) { n0[state] = 0; n1[state] = 0; From 4820cdc5d6fe9144e86e3834f4b32300c07f3095 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:39:09 +0000 Subject: [PATCH 19/31] Temp install --- benchmark/bench-quartet-conc.R | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/benchmark/bench-quartet-conc.R b/benchmark/bench-quartet-conc.R index 510ed140b..46a959ffd 100644 --- a/benchmark/bench-quartet-conc.R +++ b/benchmark/bench-quartet-conc.R @@ -1,5 +1,10 @@ library("TreeTools") -library("TreeSearch") + +tmp_lib <- tempfile(pattern = "lib") 
+dir.create(tmp_lib) +devtools::install(args = paste0("--library=", tmp_lib)) +library("TreeSearch", lib.loc = tmp_lib) + data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[42]] someNA <- PhyDatToMatrix(dataset) @@ -11,11 +16,3 @@ bench::mark( QuartetConcordance(tree, dataset), QuartetConcordance(tree, someNA), check = FALSE ) - -# -# A tibble: 2 × 13 - # expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory - # -# 1 QuartetConcorda… 237ms 239ms 4.19 2.01MB 2.09 2 1 478ms -# 2 QuartetConcorda… 233ms 236ms 4.23 1.88MB 2.12 2 1 472ms -# ℹ 2 more variables: time , gc From 515b500009c9e75258107bd9a4d1aa87d287cf77 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:56:49 +0000 Subject: [PATCH 20/31] Unanchor site conc header --- R/Concordance.R | 37 ++++++++++++++++++------------------- man/SiteConcordance.Rd | 23 +++++++++++------------ 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 0916666d4..a454999cb 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -1,8 +1,8 @@ -#' Calculate site concordance factor +#' Concordance factors #' -#' The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure -#' of the strength of support that the dataset presents for a given split in a -#' tree. +#' Concordance measures the strength of support that characters in a dataset +#' present for each split (=edge/branch) in a tree +#' \insertCite{Minh2020;SmithConc}{TreeSearch}. #' #' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) #' that are decisive for a split which are also concordant with it. @@ -36,21 +36,12 @@ # that is reflected in each split. # These give smaller values because a split may be compatible with a character # without being identical to it. -#TODO More thought / explanation needed. #' #TODO Finally, `ProfileConcordance()` (to follow) #' -#' **NOTE:** These functions are under development. They are incompletely -#' tested, and may change without notice. -#' Complete documentation and discussion will follow in due course. -#' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony -#' @param weight Logical specifying whether to weight sites according to the -#' number of quartets they are decisive for. 
-#' -#' #' #' @references #' \insertAllCited{} @@ -59,11 +50,14 @@ #' data("congreveLamsdellMatrices", package = "TreeSearch") #' dataset <- congreveLamsdellMatrices[[1]][, 1:20] #' tree <- TreeSearch::referenceTree -#' qc <- QuartetConcordance(tree, dataset) +#' #' cc <- ClusteringConcordance(tree, dataset) +#' mcc <- MutualClusteringConcordance(tree, dataset) +#' +#' qc <- QuartetConcordance(tree, dataset) +#' #' pc <- PhylogeneticConcordance(tree, dataset) #' spc <- SharedPhylogeneticConcordance(tree, dataset) -#' mcc <- MutualClusteringConcordance(tree, dataset) #' #' oPar <- par(mar = rep(0, 4), cex = 0.8) # Set plotting parameters #' plot(tree) @@ -73,7 +67,7 @@ #' par(oPar) # Restore plotting parameters #' #' # Write concordance factors to file -#' labels <- paste0(qc, "/", cc, "/", pc) # "/" is a valid delimiter +#' labels <- paste0(cc, "/", qc, "/", pc) # "/" is a valid delimiter #' # Identify the node that corresponds to each label #' whichNode <- match(TreeTools::NTip(tree) + 1:tree$Nnode, names(qc)) #' @@ -83,14 +77,19 @@ #' ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") #' #' # Display correlation between concordance factors -#' pairs(cbind(qc, cc, pc, spc, mcc), asp = 1) +#' pairs(cbind(cc, mcc, qc, pc, spc), asp = 1) #' @template MRS +#' @family split support functions +#' @name SiteConcordance +NULL + +#' @param weight Logical specifying whether to weight sites according to the +#' number of quartets they are decisive for. #' @importFrom ape keep.tip #' @importFrom cli cli_progress_bar cli_progress_update #' @importFrom utils combn #' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels -#' @name SiteConcordance -#' @family split support functions +#' @rdname SiteConcordance #' @export QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, return = "mean") { diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 75dbdbff1..21ce6386b 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -7,7 +7,7 @@ \alias{PhylogeneticConcordance} \alias{MutualClusteringConcordance} \alias{SharedPhylogeneticConcordance} -\title{Calculate site concordance factor} +\title{Concordance factors} \usage{ QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") @@ -110,9 +110,9 @@ The attribute \code{weighted.mean} gives the mean value, weighted by the information content of each character. } \description{ -The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure -of the strength of support that the dataset presents for a given split in a -tree. +Concordance measures the strength of support that characters in a dataset +present for each split (=edge/branch) in a tree +\insertCite{Minh2020;SmithConc}{TreeSearch}. } \details{ \code{QuartetConcordance()} is the proportion of quartets (sets of four leaves) @@ -141,10 +141,6 @@ of quartets for a faster, if potentially less accurate, computation. Ambiguous and inapplicable tokens are treated as containing no grouping information (i.e. \code{(02)} or \code{-} are each treated as \verb{?}). -\strong{NOTE:} These functions are under development. They are incompletely -tested, and may change without notice. -Complete documentation and discussion will follow in due course. - \code{MutualClusteringConcordance()} treats each character as a tree. Each token in the character corresponds to a node in the tree whose pendant edges are the taxa with that token. @@ -162,11 +158,14 @@ this tree and \code{tree}. 
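A brief sketch of the per-character view available through QuartetConcordance()'s return argument, using the bundled example data; the option name "char" follows the choices visible in the function body later in this series.

library("TreeSearch")
data("congreveLamsdellMatrices", package = "TreeSearch")
dataset <- congreveLamsdellMatrices[[1]][, 1:20]
tree <- referenceTree
QuartetConcordance(tree, dataset)                   # one value per split
QuartetConcordance(tree, dataset, return = "char")  # one value per character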
data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[1]][, 1:20] tree <- TreeSearch::referenceTree -qc <- QuartetConcordance(tree, dataset) + cc <- ClusteringConcordance(tree, dataset) +mcc <- MutualClusteringConcordance(tree, dataset) + +qc <- QuartetConcordance(tree, dataset) + pc <- PhylogeneticConcordance(tree, dataset) spc <- SharedPhylogeneticConcordance(tree, dataset) -mcc <- MutualClusteringConcordance(tree, dataset) oPar <- par(mar = rep(0, 4), cex = 0.8) # Set plotting parameters plot(tree) @@ -176,7 +175,7 @@ TreeTools::LabelSplits(tree, signif(cc, 3), cex = 0.8) par(oPar) # Restore plotting parameters # Write concordance factors to file -labels <- paste0(qc, "/", cc, "/", pc) # "/" is a valid delimiter +labels <- paste0(cc, "/", qc, "/", pc) # "/" is a valid delimiter # Identify the node that corresponds to each label whichNode <- match(TreeTools::NTip(tree) + 1:tree$Nnode, names(qc)) @@ -186,7 +185,7 @@ tree$node.label <- labels[whichNode] ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") # Display correlation between concordance factors -pairs(cbind(qc, cc, pc, spc, mcc), asp = 1) +pairs(cbind(cc, mcc, qc, pc, spc), asp = 1) data(congreveLamsdellMatrices) myMatrix <- congreveLamsdellMatrices[[10]] ClusteringConcordance(TreeTools::NJTree(myMatrix), myMatrix) From 8331b51ac88281d3c3377410ede9787c84b4587a Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:59:44 +0000 Subject: [PATCH 21/31] Refresh sequence --- R/Concordance.R | 295 +++++++++++++++++++++++------------------------- 1 file changed, 143 insertions(+), 152 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index a454999cb..a2f5d2cbc 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -30,15 +30,6 @@ #' Ambiguous and inapplicable tokens are treated as containing no grouping information #' (i.e. `(02)` or `-` are each treated as `?`). #' -# `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report -# the proportion of clustering information and phylogenetic information -# \insertCite{@as defined in @Vinh2010, @SmithDist}{TreeDist} within a dataset -# that is reflected in each split. -# These give smaller values because a split may be compatible with a character -# without being identical to it. -#' -#TODO Finally, `ProfileConcordance()` (to follow) -#' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony @@ -83,113 +74,6 @@ #' @name SiteConcordance NULL -#' @param weight Logical specifying whether to weight sites according to the -#' number of quartets they are decisive for. 
-#' @importFrom ape keep.tip -#' @importFrom cli cli_progress_bar cli_progress_update -#' @importFrom utils combn -#' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels -#' @rdname SiteConcordance -#' @export -QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, - return = "mean") { - if (is.null(dataset)) { - warning("Cannot calculate concordance without `dataset`.") - return(NULL) - } - if (!inherits(dataset, "phyDat")) { - stop("`dataset` must be a phyDat object.") - } - tipLabels <- intersect(TipLabels(tree), names(dataset)) - if (!length(tipLabels)) { - warning("No overlap between tree labels and dataset.") - return(NULL) - } - dataset <- dataset[tipLabels, drop = FALSE] - splits <- as.Splits(tree, dataset) - logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), - logical(NTip(dataset))) - - characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) - charLevels <- attr(dataset, "allLevels") - isAmbig <- rowSums(attr(dataset, "contrast")) > 1 - isInapp <- charLevels == "-" - nonGroupingLevels <- charLevels[isAmbig | isInapp] - characters[characters %in% nonGroupingLevels] <- NA - - charInt <- `mode<-`(characters, "integer") - raw_counts <- quartet_concordance(logiSplits, charInt) - - num <- raw_counts$concordant - den <- raw_counts$decisive - - options <- c("char", "default") - return <- options[[pmatch(tolower(trimws(return)), options, - nomatch = length(options))]] - - if (return == "default") { - if (isTRUE(weight)) { - # Sum numerator and denominator across sites (columns), then divide - # This matches weighted.mean(num/den, den) == sum(num) / sum(den) - split_sums_num <- rowSums(num) - split_sums_den <- rowSums(den) - ret <- ifelse(split_sums_den == 0, NA_real_, split_sums_num / split_sums_den) - } else { - # Mean of ratios per site - # Avoid division by zero (0/0 -> NaN -> NA handled by na.rm) - ratios <- num / den - # Replace NaN/Inf with NA for rowMeans calculation - ratios[!is.finite(ratios)] <- NA - ret <- rowMeans(ratios, na.rm = TRUE) - } - - setNames(ret, names(splits)) - } else { - p <- num / den - if (isTRUE(weight)) { - vapply(seq_len(dim(num)[[2]]), function(i) { - weighted.mean(num[, i] / den[, i], den[, i]) - }, double(1)) - } else { - vapply(seq_len(dim(num)[[2]]), function(i) { - mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) - }, double(1)) - } - } -} - -#' @importFrom fastmap fastmap -.ExpectedMICache <- fastmap() - -# @param a must be a vector of length <= 2 -# @param b may be longer -#' @importFrom base64enc base64encode -.ExpectedMI <- function(a, b) { - if (length(a) < 2 || length(b) < 2) { - 0 - } else { - key <- base64enc::base64encode(mi_key(a, b)) - if (.ExpectedMICache$has(key)) { - .ExpectedMICache$get(key) - } else { - ret <- expected_mi(a, b) - - # Cache: - .ExpectedMICache$set(key, ret) - # Return: - ret - } - } -} - -#' Re-zero a value by normalization -#' @param value value ranging from zero to one -#' @param zero new value to set as zero -#' @keywords internal -.Rezero <- function(value, zero) { - (value - zero) / (1 - zero) -} - #' @rdname SiteConcordance #' @param return Character specifying what to return. #' - `"mean"` returns the mean concordance index at each split across all sites. @@ -619,6 +503,149 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, invisible(list(info = info, amount = amount, quality = quality, col = col)) } +#' @rdname SiteConcordance +#' @details +#' `MutualClusteringConcordance()` treats each character as a tree. 
+#' Each token in the character corresponds to a node in the tree whose pendant +#' edges are the taxa with that token. +#' The Mutual Clustering Concordance is then the Mutual Clustering Information +#' \insertCite{Smith2020}{TreeSearch} of the tree thus defined with `tree`. +#' +#' @return `MutualClusteringConcordance()` returns the mutual clustering +#' concordance of each character in `dataset` with `tree`. +#' The attribute `weighted.mean` gives the mean value, weighted by the +#' information content of each character. +#' @importFrom TreeTools MatchStrings +#' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo +#' @export +MutualClusteringConcordance <- function (tree, dataset) { + if (is.null(dataset)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] + splits <- as.multiPhylo(as.Splits(tree)) + characters <- as.multiPhylo(dataset) + + support <- rowSums(vapply(characters, function (char) { + trimmed <- KeepTip(splits, TipLabels(char)) + cbind(mi = MutualClusteringInfo(char, trimmed), + possible = ClusteringEntropy(trimmed)) + }, matrix(NA_real_, length(splits), 2)), dims = 2) + + ret <- support[, 1] / support[, 2] + # Return: + structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) +} + +#' @param weight Logical specifying whether to weight sites according to the +#' number of quartets they are decisive for. +#' @importFrom ape keep.tip +#' @importFrom cli cli_progress_bar cli_progress_update +#' @importFrom utils combn +#' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels +#' @rdname SiteConcordance +#' @export +QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, + return = "mean") { + if (is.null(dataset)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be a phyDat object.") + } + tipLabels <- intersect(TipLabels(tree), names(dataset)) + if (!length(tipLabels)) { + warning("No overlap between tree labels and dataset.") + return(NULL) + } + dataset <- dataset[tipLabels, drop = FALSE] + splits <- as.Splits(tree, dataset) + logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), + logical(NTip(dataset))) + + characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) + charLevels <- attr(dataset, "allLevels") + isAmbig <- rowSums(attr(dataset, "contrast")) > 1 + isInapp <- charLevels == "-" + nonGroupingLevels <- charLevels[isAmbig | isInapp] + characters[characters %in% nonGroupingLevels] <- NA + + charInt <- `mode<-`(characters, "integer") + raw_counts <- quartet_concordance(logiSplits, charInt) + + num <- raw_counts$concordant + den <- raw_counts$decisive + + options <- c("char", "default") + return <- options[[pmatch(tolower(trimws(return)), options, + nomatch = length(options))]] + + if (return == "default") { + if (isTRUE(weight)) { + # Sum numerator and denominator across sites (columns), then divide + # This matches weighted.mean(num/den, den) == sum(num) / sum(den) + split_sums_num <- rowSums(num) + split_sums_den <- rowSums(den) + ret <- ifelse(split_sums_den == 0, NA_real_, split_sums_num / split_sums_den) + } else { + # Mean of ratios per site + # Avoid division by zero (0/0 -> NaN -> NA handled by na.rm) + ratios <- num / den + # Replace NaN/Inf with NA for rowMeans calculation + ratios[!is.finite(ratios)] <- NA + ret <- rowMeans(ratios, na.rm = TRUE) + } + + setNames(ret, names(splits)) + } else { + p <- num / den + if 
(isTRUE(weight)) { + vapply(seq_len(dim(num)[[2]]), function(i) { + weighted.mean(num[, i] / den[, i], den[, i]) + }, double(1)) + } else { + vapply(seq_len(dim(num)[[2]]), function(i) { + mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) + }, double(1)) + } + } +} + +#' @importFrom fastmap fastmap +.ExpectedMICache <- fastmap() + +# @param a must be a vector of length <= 2 +# @param b may be longer +#' @importFrom base64enc base64encode +.ExpectedMI <- function(a, b) { + if (length(a) < 2 || length(b) < 2) { + 0 + } else { + key <- base64enc::base64encode(mi_key(a, b)) + if (.ExpectedMICache$has(key)) { + .ExpectedMICache$get(key) + } else { + ret <- expected_mi(a, b) + + # Cache: + .ExpectedMICache$set(key, ret) + # Return: + ret + } + } +} + +#' Re-zero a value by normalization +#' @param value value ranging from zero to one +#' @param zero new value to set as zero +#' @keywords internal +.Rezero <- function(value, zero) { + (value - zero) / (1 - zero) +} + #' @rdname SiteConcordance #' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits #' MatchStrings @@ -658,42 +685,6 @@ PhylogeneticConcordance <- function (tree, dataset) { support[, 1] / support[, 2] } -#' @rdname SiteConcordance -#' @details -#' `MutualClusteringConcordance()` treats each character as a tree. -#' Each token in the character corresponds to a node in the tree whose pendant -#' edges are the taxa with that token. -#' The Mutual Clustering Concordance is then the Mutual Clustering Information -#' \insertCite{Smith2020}{TreeSearch} of the tree thus defined with `tree`. -#' -#' @return `MutualClusteringConcordance()` returns the mutual clustering -#' concordance of each character in `dataset` with `tree`. -#' The attribute `weighted.mean` gives the mean value, weighted by the -#' information content of each character. -#' @importFrom TreeTools MatchStrings -#' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo -#' @export -MutualClusteringConcordance <- function (tree, dataset) { - if (is.null(dataset)) { - warning("Cannot calculate concordance without `dataset`.") - return(NULL) - } - - dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] - splits <- as.multiPhylo(as.Splits(tree)) - characters <- as.multiPhylo(dataset) - - support <- rowSums(vapply(characters, function (char) { - trimmed <- KeepTip(splits, TipLabels(char)) - cbind(mi = MutualClusteringInfo(char, trimmed), - possible = ClusteringEntropy(trimmed)) - }, matrix(NA_real_, length(splits), 2)), dims = 2) - - ret <- support[, 1] / support[, 2] - # Return: - structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) -} - #' @rdname SiteConcordance #' @details #' `SharedPhylogeneticConcordance()` treats each character as a simple tree. From a4601e97c61bf24a14535e74b94f561d11fffc92 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:11:15 +0000 Subject: [PATCH 22/31] Doc w/ fns --- R/Concordance.R | 96 ++++++++++++++++++++++++------------------ man/SiteConcordance.Rd | 56 ++++++++++++------------ 2 files changed, 84 insertions(+), 68 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index a2f5d2cbc..45e30da93 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -4,32 +4,6 @@ #' present for each split (=edge/branch) in a tree #' \insertCite{Minh2020;SmithConc}{TreeSearch}. #' -#' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) -#' that are decisive for a split which are also concordant with it. 
-#' For example, a quartet with the characters `0 0 0 1` is not decisive, as -#' all relationships between those leaves are equally parsimonious. -#' But a quartet with characters `0 0 1 1` is decisive, and is concordant -#' with any tree that groups the first two leaves together to the exclusion -#' of the second. -#' -#' By default, the reported value weights each site by the number of quartets -#' it is decisive for. This value can be interpreted as the proportion of -#' all decisive quartets that are concordant with a split. -#' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. -#' Consider a split associated with two sites: -#' one that is concordant with 25% of 96 decisive quartets, and -#' a second that is concordant with 75% of 4 decisive quartets. -#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. -#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. -#' -#' `QuartetConcordance()` is computed exactly, using all quartets, where as -#' other implementations (e.g. IQ-TREE) follow -#' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -#' of quartets for a faster, if potentially less accurate, computation. -#' Ambiguous and inapplicable tokens are treated as containing no grouping information -#' (i.e. `(02)` or `-` are each treated as `?`). -#' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony @@ -75,6 +49,17 @@ NULL #' @rdname SiteConcordance +#' @details +#' `ClusteringConcordance()` measures how well each split reflects the +#' grouping information present in each character +#' \insertCite{SmithConc}{TreeSearch}. It treats characters and +#' splits as clusterings of taxa, quantifying their agreement using normalized +#' mutual information. Values range from 1 (perfect agreement) to 0 (no more +#' agreement than expected by chance); negative values indicate less agreement +#' than expected. Summaries returned by `return = "edge"` or `"char"` report, +#' respectively, how well each split is supported by all characters, or how +#' well each character is reflected across all splits. +#' #' @param return Character specifying what to return. #' - `"mean"` returns the mean concordance index at each split across all sites. #' - `"all"` returns all values calculated during the working for each site at @@ -129,16 +114,15 @@ NULL #' `ClusteringConcordance(return = "tree")` returns a single value representing #' the overall concordance between the tree topology and the character data. #' This averages the fit of the best-matching split for each character. -#' This is probably biased. I have not identified a circumstance in which it -#' produces meaningful results. Let me know if you find one. -#' -#' I had previously considered calculating -#' the entropy-weighted average of normalized mutual -#' information across all split-character pairs, where each pair contributes -#' proportionally to its potential information content. -#' The problem here is that imperfect matches between compatible splits -#' come to dominate, resulting in a small score that gets smaller as trees get -#' larger, even with a perfect fit. +#' This is included for completeness, though it is not clear that this is a useful +#' or meaningful measure. 
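A small arithmetic sketch of the rescaling described above, anchored as in the value documentation (1 at hBest, 0 at the random expectation); the figures are invented for illustration and the exact internal formula is not restated here.

mi     <- 0.62  # observed mutual information between a split and a character
hBest  <- 0.97  # min(entropy of split, entropy of character): ceiling on mi
miRand <- 0.15  # expected mutual information under random association
(mi - miRand) / (hBest - miRand)  # ~0.57 on the 0 (random) to 1 (perfect) scale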
+# +# I had previously considered calculating the entropy-weighted average of normalized +# mutual information across all split-character pairs, where each pair contributes +# proportionally to its potential information content. +# The problem here is that imperfect matches between compatible splits +# come to dominate, resulting in a small score that gets smaller as trees get +# larger, even with a perfect fit. #' #' #' @seealso @@ -505,11 +489,12 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, #' @rdname SiteConcordance #' @details -#' `MutualClusteringConcordance()` treats each character as a tree. -#' Each token in the character corresponds to a node in the tree whose pendant -#' edges are the taxa with that token. -#' The Mutual Clustering Concordance is then the Mutual Clustering Information -#' \insertCite{Smith2020}{TreeSearch} of the tree thus defined with `tree`. +#' `MutualClusteringConcordance()` provides a character‑wise summary that +#' emphasises each character’s best‑matching split(s). It treats each character +#' as a simple tree and computes the mutual clustering information between this +#' character‑tree and the supplied phylogeny. High values identify characters +#' whose signal is well represented anywhere in the tree, even if concentrated +#' on a single edge. #' #' @return `MutualClusteringConcordance()` returns the mutual clustering #' concordance of each character in `dataset` with `tree`. @@ -539,13 +524,40 @@ MutualClusteringConcordance <- function (tree, dataset) { structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) } +#' @rdname SiteConcordance +#' @details +#' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) +#' that are decisive for a split which are also concordant with it +#' (the site concordance factor \insertCite{Minh2020}{TreeSearch}). +#' For example, a quartet with the characters `0 0 0 1` is not decisive, as +#' all relationships between those leaves are equally parsimonious. +#' But a quartet with characters `0 0 1 1` is decisive, and is concordant +#' with any tree that groups the first two leaves together to the exclusion +#' of the second. +#' +#' By default, the reported value weights each site by the number of quartets +#' it is decisive for. This value can be interpreted as the proportion of +#' all decisive quartets that are concordant with a split. +#' If `weight = FALSE`, the reported value is the mean of the concordance +#' value for each site. +#' Consider a split associated with two sites: +#' one that is concordant with 25% of 96 decisive quartets, and +#' a second that is concordant with 75% of 4 decisive quartets. +#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. +#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. +#' +#' `QuartetConcordance()` is computed exactly, using all quartets, where as +#' other implementations (e.g. IQ-TREE) follow +#' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample +#' of quartets for a faster, if potentially less accurate, computation. +#' Ambiguous and inapplicable tokens are treated as containing no grouping information +#' (i.e. `(02)` or `-` are each treated as `?`). #' @param weight Logical specifying whether to weight sites according to the #' number of quartets they are decisive for. 
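Worked through as a quick sketch, using the counts given in the example above (24 of 96 and 3 of 4 decisive quartets concordant):

concordant <- c(24, 3)  # concordant quartets at each of two sites
decisive   <- c(96, 4)  # decisive quartets at each site
sum(concordant) / sum(decisive)  # weight = TRUE:  (24 + 3) / (96 + 4) = 27%
mean(concordant / decisive)      # weight = FALSE: mean(25%, 75%) = 50%
weighted.mean(concordant / decisive, decisive)  # equals the weight = TRUE value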
#' @importFrom ape keep.tip #' @importFrom cli cli_progress_bar cli_progress_update #' @importFrom utils combn #' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels -#' @rdname SiteConcordance #' @export QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, return = "mean") { diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 21ce6386b..c86bdf2f6 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -2,21 +2,21 @@ % Please edit documentation in R/Concordance.R \name{SiteConcordance} \alias{SiteConcordance} -\alias{QuartetConcordance} \alias{ClusteringConcordance} -\alias{PhylogeneticConcordance} \alias{MutualClusteringConcordance} +\alias{QuartetConcordance} +\alias{PhylogeneticConcordance} \alias{SharedPhylogeneticConcordance} \title{Concordance factors} \usage{ -QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") - ClusteringConcordance(tree, dataset, return = "edge", normalize = TRUE) -PhylogeneticConcordance(tree, dataset) - MutualClusteringConcordance(tree, dataset) +QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") + +PhylogeneticConcordance(tree, dataset) + SharedPhylogeneticConcordance(tree, dataset) } \arguments{ @@ -28,9 +28,6 @@ Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. Additive (ordered) characters can be handled using \code{\link[TreeTools]{Decompose}()}.} -\item{weight}{Logical specifying whether to weight sites according to the -number of quartets they are decisive for.} - \item{return}{Character specifying what to return. \itemize{ \item \code{"mean"} returns the mean concordance index at each split across all sites. @@ -55,6 +52,9 @@ corresponds to the maximum possible value, and 0 to the expected value. If \code{normalize = TRUE}, this will be the expected value for a random character on the given tree. If \code{normalize} is numeric, the expected value will be estimated by fitting the character to \code{n} uniformly random trees.} + +\item{weight}{Logical specifying whether to weight sites according to the +number of quartets they are decisive for.} } \value{ \code{ClusteringConcordance(return = "all")} returns a 3D array where each @@ -88,16 +88,8 @@ from splits that can potentially convey more information about them. \code{ClusteringConcordance(return = "tree")} returns a single value representing the overall concordance between the tree topology and the character data. This averages the fit of the best-matching split for each character. -This is probably biased. I have not identified a circumstance in which it -produces meaningful results. Let me know if you find one. - -I had previously considered calculating -the entropy-weighted average of normalized mutual -information across all split-character pairs, where each pair contributes -proportionally to its potential information content. -The problem here is that imperfect matches between compatible splits -come to dominate, resulting in a small score that gets smaller as trees get -larger, even with a perfect fit. +This is included for completeness, though it is not clear that this is a useful +or meaningful measure. \code{MutualClusteringConcordance()} returns the mutual clustering concordance of each character in \code{dataset} with \code{tree}. @@ -115,8 +107,26 @@ present for each split (=edge/branch) in a tree \insertCite{Minh2020;SmithConc}{TreeSearch}. 
} \details{ +\code{ClusteringConcordance()} measures how well each split reflects the +grouping information present in each character +\insertCite{SmithConc}{TreeSearch}. It treats characters and +splits as clusterings of taxa, quantifying their agreement using normalized +mutual information. Values range from 1 (perfect agreement) to 0 (no more +agreement than expected by chance); negative values indicate less agreement +than expected. Summaries returned by \code{return = "edge"} or \code{"char"} report, +respectively, how well each split is supported by all characters, or how +well each character is reflected across all splits. + +\code{MutualClusteringConcordance()} provides a character‑wise summary that +emphasises each character’s best‑matching split(s). It treats each character +as a simple tree and computes the mutual clustering information between this +character‑tree and the supplied phylogeny. High values identify characters +whose signal is well represented anywhere in the tree, even if concentrated +on a single edge. + \code{QuartetConcordance()} is the proportion of quartets (sets of four leaves) -that are decisive for a split which are also concordant with it. +that are decisive for a split which are also concordant with it +(the site concordance factor \insertCite{Minh2020}{TreeSearch}). For example, a quartet with the characters \verb{0 0 0 1} is not decisive, as all relationships between those leaves are equally parsimonious. But a quartet with characters \verb{0 0 1 1} is decisive, and is concordant @@ -141,12 +151,6 @@ of quartets for a faster, if potentially less accurate, computation. Ambiguous and inapplicable tokens are treated as containing no grouping information (i.e. \code{(02)} or \code{-} are each treated as \verb{?}). -\code{MutualClusteringConcordance()} treats each character as a tree. -Each token in the character corresponds to a node in the tree whose pendant -edges are the taxa with that token. -The Mutual Clustering Concordance is then the Mutual Clustering Information -\insertCite{Smith2020}{TreeSearch} of the tree thus defined with \code{tree}. - \code{SharedPhylogeneticConcordance()} treats each character as a simple tree. Each token in the character corresponds to a node whose pendant edges are the taxa with that token. From 52b6fb2681b79768db53d385cce4fb6f8fd81d23 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:29:41 +0000 Subject: [PATCH 23/31] Quartet return --- R/Concordance.R | 44 +++++++++++++++++++++++++----------------- man/SiteConcordance.Rd | 21 +++++++++++--------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 45e30da93..61ccb11cb 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -410,10 +410,10 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) { } #' Plot concordance table -#' +#' #' `ConcordanceTable()` plots a concordance table #' \insertCite{SmithConc}{TreeSearch}. -#' +#' #' @inheritParams ClusteringConcordance #' @param Col Function that takes vectors `amount` and `quality` and returns #' a vector of colours. [QCol] colours by data quality (concordance); @@ -429,14 +429,14 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) { #' - `"relInfo"`: The information, normalized to the most information-rich pair; #' - `"quality"`: The normalized mutual information of the pair; #' - `"col"`: The colours used to plot the table. 
-#' -#' @references \insertAllCited{} +#' +#' @references \insertAllCited{} #' @examples #' # Load data and tree #' data("congreveLamsdellMatrices", package = "TreeSearch") #' dataset <- congreveLamsdellMatrices[[1]][, 1:20] #' tree <- referenceTree -#' +#' #' # Plot tree and identify nodes #' library("TreeTools", quietly = TRUE) #' plot(tree) @@ -444,12 +444,12 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) { #' nodelabels(seq_along(nodeIndex), nodeIndex, adj = c(2, 1), #' frame = "none", bg = NULL) #' QALegend(where = c(0.1, 0.4, 0.1, 0.3)) -#' +#' #' # View information shared by characters and edges #' ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) #' axis(1) #' axis(2) -#' +#' #' # Visualize dataset #' image(t(`mode<-`(PhyDatToMatrix(dataset), "numeric")), axes = FALSE, #' xlab = "Leaf", ylab = "Character") @@ -467,7 +467,7 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, # Plot points with incalculable quality as black, not transparent. amount[is.na(quality)] <- 0 quality[is.na(quality)] <- 0 - + col <- matrix(Col(amount, quality), dim(amount)[[1]], dim(amount)[[2]]) image(nodes, seq_len(dim(cc)[[3]]), matrix(1:prod(dim(amount)), dim(amount)[[1]]), @@ -495,7 +495,7 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, #' character‑tree and the supplied phylogeny. High values identify characters #' whose signal is well represented anywhere in the tree, even if concentrated #' on a single edge. -#' +#' #' @return `MutualClusteringConcordance()` returns the mutual clustering #' concordance of each character in `dataset` with `tree`. #' The attribute `weighted.mean` gives the mean value, weighted by the @@ -508,7 +508,7 @@ MutualClusteringConcordance <- function (tree, dataset) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.multiPhylo(as.Splits(tree)) characters <- as.multiPhylo(dataset) @@ -529,29 +529,37 @@ MutualClusteringConcordance <- function (tree, dataset) { #' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) #' that are decisive for a split which are also concordant with it #' (the site concordance factor \insertCite{Minh2020}{TreeSearch}). -#' For example, a quartet with the characters `0 0 0 1` is not decisive, as +#' For example, a quartet with the characters `0 0 0 1` is not decisive, as #' all relationships between those leaves are equally parsimonious. #' But a quartet with characters `0 0 1 1` is decisive, and is concordant #' with any tree that groups the first two leaves together to the exclusion #' of the second. -#' +#' #' By default, the reported value weights each site by the number of quartets #' it is decisive for. This value can be interpreted as the proportion of #' all decisive quartets that are concordant with a split. #' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. +#' value for each site. #' Consider a split associated with two sites: #' one that is concordant with 25% of 96 decisive quartets, and #' a second that is concordant with 75% of 4 decisive quartets. #' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. #' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. -#' +#' #' `QuartetConcordance()` is computed exactly, using all quartets, where as #' other implementations (e.g. 
IQ-TREE) follow #' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample #' of quartets for a faster, if potentially less accurate, computation. -#' Ambiguous and inapplicable tokens are treated as containing no grouping information -#' (i.e. `(02)` or `-` are each treated as `?`). +#' Ambiguous and inapplicable tokens are treated as containing no grouping +#' information (i.e. `(02)` or `-` are each treated as `?`). +#' @return +#' `QuartetConcordance(return = "edge")` returns a numeric vector giving the +#' concordance index at each split across all sites; names specify the number of +#' each corresponding split in `tree`. +#' +#' `QuartetConcordance(return = "char")` returns a numeric vector giving the +#' concordance index calculated at each site, averaged across all splits. +#' #' @param weight Logical specifying whether to weight sites according to the #' number of quartets they are decisive for. #' @importFrom ape keep.tip @@ -584,10 +592,10 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, isInapp <- charLevels == "-" nonGroupingLevels <- charLevels[isAmbig | isInapp] characters[characters %in% nonGroupingLevels] <- NA - + charInt <- `mode<-`(characters, "integer") raw_counts <- quartet_concordance(logiSplits, charInt) - + num <- raw_counts$concordant den <- raw_counts$decisive diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index c86bdf2f6..c1a253eff 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -13,7 +13,7 @@ ClusteringConcordance(tree, dataset, return = "edge", normalize = TRUE) MutualClusteringConcordance(tree, dataset) -QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "mean") +QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "edge") PhylogeneticConcordance(tree, dataset) @@ -28,12 +28,8 @@ Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. Additive (ordered) characters can be handled using \code{\link[TreeTools]{Decompose}()}.} -\item{return}{Character specifying what to return. -\itemize{ -\item \code{"mean"} returns the mean concordance index at each split across all sites. -\item \code{"all"} returns all values calculated during the working for each site at -each split. -}} +\item{return}{Character specifying whether to summarize support per +character (\code{"char"}) or per edge (\code{"edge"}). See below for details.} \item{normalize}{Logical or numeric; if \code{TRUE} the mutual information will be normalized such that zero corresponds to the expected mutual information of @@ -96,6 +92,13 @@ concordance of each character in \code{dataset} with \code{tree}. The attribute \code{weighted.mean} gives the mean value, weighted by the information content of each character. +\code{QuartetConcordance(return = "edge")} returns a numeric vector giving the +concordance index at each split across all sites; names specify the number of +each corresponding split in \code{tree}. + +\code{QuartetConcordance(return = "char")} returns a numeric vector giving the +concordance index calculated at each site, averaged across all splits. + \code{SharedPhylogeneticConcordance()} returns the shared phylogenetic concordance of each character in \code{dataset} with \code{tree}. The attribute \code{weighted.mean} gives the mean value, weighted by the @@ -148,8 +151,8 @@ If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. other implementations (e.g. 
IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample of quartets for a faster, if potentially less accurate, computation. -Ambiguous and inapplicable tokens are treated as containing no grouping information -(i.e. \code{(02)} or \code{-} are each treated as \verb{?}). +Ambiguous and inapplicable tokens are treated as containing no grouping +information (i.e. \code{(02)} or \code{-} are each treated as \verb{?}). \code{SharedPhylogeneticConcordance()} treats each character as a simple tree. Each token in the character corresponds to a node whose pendant edges are the From eafe445d0776ca9f9443a4c1c2d5d54699efa493 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:31:10 +0000 Subject: [PATCH 24/31] return --- R/Concordance.R | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 61ccb11cb..5b34c1f29 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -598,18 +598,22 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, num <- raw_counts$concordant den <- raw_counts$decisive - - options <- c("char", "default") return <- options[[pmatch(tolower(trimws(return)), options, nomatch = length(options))]] + + options <- c("character", "site", "default") if (return == "default") { if (isTRUE(weight)) { # Sum numerator and denominator across sites (columns), then divide # This matches weighted.mean(num/den, den) == sum(num) / sum(den) split_sums_num <- rowSums(num) split_sums_den <- rowSums(den) - ret <- ifelse(split_sums_den == 0, NA_real_, split_sums_num / split_sums_den) + ret <- ifelse( + split_sums_den == 0, + NA_real_, + split_sums_num / split_sums_den + ) } else { # Mean of ratios per site # Avoid division by zero (0/0 -> NaN -> NA handled by na.rm) @@ -618,18 +622,25 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, ratios[!is.finite(ratios)] <- NA ret <- rowMeans(ratios, na.rm = TRUE) } - + setNames(ret, names(splits)) } else { + # return = "char" p <- num / den if (isTRUE(weight)) { vapply(seq_len(dim(num)[[2]]), function(i) { weighted.mean(num[, i] / den[, i], den[, i]) }, double(1)) + function(i) { + weighted.mean(num[, i] / den[, i], den[, i]) } else { vapply(seq_len(dim(num)[[2]]), function(i) { mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) }, double(1)) + vapply( + seq_len(dim(num)[[2]]), + }, + double(1) } } } From 06db1bc0154f5beb42501496d8d277d99b7784d6 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 11:38:06 +0000 Subject: [PATCH 25/31] Format |-: --- R/Concordance.R | 186 +++++++++++++++++++++++++----------------------- 1 file changed, 96 insertions(+), 90 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 5b34c1f29..ca6510d02 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -1,46 +1,46 @@ #' Concordance factors -#' +#' #' Concordance measures the strength of support that characters in a dataset #' present for each split (=edge/branch) in a tree #' \insertCite{Minh2020;SmithConc}{TreeSearch}. 
-#' +#' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony -#' -#' @references +#' +#' @references #' \insertAllCited{} -#' -#' @examples +#' +#' @examples #' data("congreveLamsdellMatrices", package = "TreeSearch") #' dataset <- congreveLamsdellMatrices[[1]][, 1:20] #' tree <- TreeSearch::referenceTree -#' +#' #' cc <- ClusteringConcordance(tree, dataset) #' mcc <- MutualClusteringConcordance(tree, dataset) -#' +#' #' qc <- QuartetConcordance(tree, dataset) -#' +#' #' pc <- PhylogeneticConcordance(tree, dataset) #' spc <- SharedPhylogeneticConcordance(tree, dataset) -#' +#' #' oPar <- par(mar = rep(0, 4), cex = 0.8) # Set plotting parameters #' plot(tree) #' TreeTools::LabelSplits(tree, signif(qc, 3), cex = 0.8) #' plot(tree) #' TreeTools::LabelSplits(tree, signif(cc, 3), cex = 0.8) #' par(oPar) # Restore plotting parameters -#' +#' #' # Write concordance factors to file #' labels <- paste0(cc, "/", qc, "/", pc) # "/" is a valid delimiter #' # Identify the node that corresponds to each label #' whichNode <- match(TreeTools::NTip(tree) + 1:tree$Nnode, names(qc)) -#' +#' #' # The contents of tree$node.label will be written at each node #' tree$node.label <- labels[whichNode] -#' +#' #' ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") -#' +#' #' # Display correlation between concordance factors #' pairs(cbind(cc, mcc, qc, pc, spc), asp = 1) #' @template MRS @@ -59,15 +59,13 @@ NULL #' than expected. Summaries returned by `return = "edge"` or `"char"` report, #' respectively, how well each split is supported by all characters, or how #' well each character is reflected across all splits. -#' -#' @param return Character specifying what to return. -#' - `"mean"` returns the mean concordance index at each split across all sites. -#' - `"all"` returns all values calculated during the working for each site at -#' each split. +#' +#' @param return Character specifying whether to summarize support per +#' character (`"char"`) or per edge (`"edge"`). See below for details. #' @param normalize Logical or numeric; if `TRUE` the mutual information will be #' normalized such that zero corresponds to the expected mutual information of #' a randomly drawn character with the same distribution of tokens. -#' If `FALSE`, zero will correspond to zero mutual information, +#' If `FALSE`, zero will correspond to zero mutual information, #' even if this is not achievable in practice. #' The exact analytical solution, though fast, does not account for #' non-independence between splits. This limitation is minor for larger @@ -75,56 +73,56 @@ NULL #' For smaller trees, the expected value for random trees can be approximated #' by resampling relabelled trees. Setting `normalize = n` will approximate the #' expected value based on _n_ samples. -#' +#' #' For `return = "char"`, `"tree"`, values will be normalized such that 1 #' corresponds to the maximum possible value, and 0 to the expected value. #' If `normalize = TRUE`, this will be the expected value for a random #' character on the given tree. If `normalize` is numeric, the expected value #' will be estimated by fitting the character to `n` uniformly random trees. -#' -#' @returns +#' +#' @returns #' `ClusteringConcordance(return = "all")` returns a 3D array where each -#' slice corresponds to a character (site), each column to a tree split, and -#' each row to a different information measure. 
The `normalized` row gives the -#' normalized mutual information between each split-character pair, scaled so -#' that 1.0 corresponds to `hBest` (the theoretical maximum mutual information, -#' being the minimum of `hSplit` and `hChar`) and 0.0 corresponds to `miRand` -#' (the expected mutual information under random association). `hSplit` gives -#' the entropy (information content) of each split's bipartition; `hChar` gives -#' the entropy of each character's state distribution; `hJoint` gives the joint -#' entropy of the split-character confusion matrix; `mi` gives the raw mutual -#' information; and `n` records the number of informative observations. -#' Negative normalized values indicate observed mutual information below random +#' slice corresponds to a character (site), each column to a tree split, and +#' each row to a different information measure. The `normalized` row gives the +#' normalized mutual information between each split-character pair, scaled so +#' that 1.0 corresponds to `hBest` (the theoretical maximum mutual information, +#' being the minimum of `hSplit` and `hChar`) and 0.0 corresponds to `miRand` +#' (the expected mutual information under random association). `hSplit` gives +#' the entropy (information content) of each split's bipartition; `hChar` gives +#' the entropy of each character's state distribution; `hJoint` gives the joint +#' entropy of the split-character confusion matrix; `mi` gives the raw mutual +#' information; and `n` records the number of informative observations. +#' Negative normalized values indicate observed mutual information below random #' expectation. `NA` is returned when `hBest = 0` (no information potential). -#' -#' `ClusteringConcordance(return = "edge")` returns a vector where each element +#' +#' `ClusteringConcordance(return = "edge")` returns a vector where each element #' corresponds to a split (an edge of the tree) and gives the normalized mutual #' information between that split and the character data, averaged across all #' characters. -#' When `normalize = TRUE` (default), values are scaled relative to random -#' expectation; when `FALSE`, raw mutual information normalized by `hBest` is +#' When `normalize = TRUE` (default), values are scaled relative to random +#' expectation; when `FALSE`, raw mutual information normalized by `hBest` is #' returned. -#' -#' `ClusteringConcordance(return = "char")` returns a vector where each element -#' corresponds to a character (site) and gives the entropy-weighted average -#' normalized mutual information between that character and all tree splits. -#' Characters with higher information content receive proportionally more weight +#' +#' `ClusteringConcordance(return = "char")` returns a vector where each element +#' corresponds to a character (site) and gives the entropy-weighted average +#' normalized mutual information between that character and all tree splits. +#' Characters with higher information content receive proportionally more weight #' from splits that can potentially convey more information about them. #' -#' `ClusteringConcordance(return = "tree")` returns a single value representing +#' `ClusteringConcordance(return = "tree")` returns a single value representing #' the overall concordance between the tree topology and the character data. #' This averages the fit of the best-matching split for each character. #' This is included for completeness, though it is not clear that this is a useful #' or meaningful measure. 
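A minimal usage sketch of the `return` options described above, reusing the example dataset that appears elsewhere in this file; the commented dimensions follow the description of the 3D array given above rather than output checked here:

library("TreeSearch")
data("congreveLamsdellMatrices", package = "TreeSearch")
dataset <- congreveLamsdellMatrices[[1]][, 1:20]
tree <- referenceTree
perEdge <- ClusteringConcordance(tree, dataset, return = "edge")
perChar <- ClusteringConcordance(tree, dataset, return = "char")
everything <- ClusteringConcordance(tree, dataset, return = "all")
dim(everything)  # information measures x splits x characters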
-# +# # I had previously considered calculating the entropy-weighted average of normalized -# mutual information across all split-character pairs, where each pair contributes +# mutual information across all split-character pairs, where each pair contributes # proportionally to its potential information content. # The problem here is that imperfect matches between compatible splits # come to dominate, resulting in a small score that gets smaller as trees get # larger, even with a perfect fit. -#' -#' +#' +#' #' @seealso #' - [Consistency()] #' @examples @@ -138,8 +136,12 @@ NULL #' MutualClusteringInfo #' @importFrom TreeTools as.Splits MatchStrings Subsplit TipLabels #' @export -ClusteringConcordance <- function (tree, dataset, return = "edge", - normalize = TRUE) { +ClusteringConcordance <- function( + tree, + dataset, + return = "edge", + normalize = TRUE +) { # Check inputs if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") @@ -149,28 +151,28 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", warning("Cannot calculate concordance without `dataset`.") return(NULL) } - + keep <- MatchStrings(TipLabels(tree), names(dataset), warning) if (length(keep) == 0) { return(NULL) } dataset <- dataset[keep] - + # Prepare data splits <- as.logical(as.Splits(tree)) - + at <- attributes(dataset) cont <- at[["contrast"]] if ("-" %in% colnames(cont)) { cont[cont[, "-"] > 0, ] <- 1 } ambiguous <- rowSums(cont) != 1 - + mat <- matrix(as.integer(unlist(dataset)), length(dataset), byrow = TRUE) mat[mat %in% which(ambiguous)] <- NA_integer_ maxToken <- max(mat, na.rm = TRUE) tokens <- as.character(seq_len(maxToken)) - mat <- apply(mat, 2, function (x) { + mat <- apply(mat, 2, function(x) { uniques <- tabulate(x, maxToken) == 1 x[x %in% tokens[uniques]] <- NA_integer_ x @@ -213,7 +215,7 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", # Matrix to 3D array dim(h) <- c(dim(h), 1) } - + h[abs(h) < sqrt(.Machine$double.eps)] <- 0 hh <- h[, , at[["index"]], drop = FALSE] @@ -223,11 +225,11 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", hh["hSplit", , , drop = FALSE] - hh["hJoint", , , drop = FALSE], NULL) miRand <- `rownames<-`(hh["miRand", , , drop = FALSE], NULL) - norm <- if(isFALSE(normalize)) { - ifelse(hBest == 0, NA, mi / hBest) - } else { - ifelse(hBest == 0, NA, .Rezero(mi / hBest, miRand / hBest)) - } + norm <- if (isFALSE(normalize)) { + ifelse(hBest == 0, NA, mi / hBest) + } else { + ifelse(hBest == 0, NA, .Rezero(mi / hBest, miRand / hBest)) + } returnType <- pmatch(tolower(return), c("all", "edge", "character", "tree"), nomatch = 1L) @@ -249,7 +251,7 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", mcseTree <- sqrt(treeVar / normalize) } } - + # Return: switch(returnType, # all @@ -328,7 +330,7 @@ ClusteringConcordance <- function (tree, dataset, return = "edge", #' Generate colour to depict the amount and quality of observations #' @param amount Numeric vector of values between 0 and 1, denoting the relative #' amount of information -#' @param quality Numeric vector of values between -1 and 1, denoting the +#' @param quality Numeric vector of values between -1 and 1, denoting the #' quality of observations, where 0 is neutral. 
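For instance, a vectorized call along the following lines (a usage sketch based only on the parameter descriptions above) should yield one colour per observation:

QACol(amount = c(0.1, 0.5, 1), quality = c(-0.8, 0, 0.9))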
#' @return `QACol()` returns an RGB hex code for a colour, where lighter colours #' correspond to entries with a higher `amount`; unsaturated colours denote @@ -389,7 +391,7 @@ QCol <- function(amount, quality) { #' @rdname QACol #' @param where Location of legend, passed to `par(fig = where)` -#' @param n Integer vector giving number of cells to plot in swatch for +#' @param n Integer vector giving number of cells to plot in swatch for #' `quality` and `amount`. #' @inheritParams ConcordanceTable #' @export @@ -503,7 +505,7 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, #' @importFrom TreeTools MatchStrings #' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo #' @export -MutualClusteringConcordance <- function (tree, dataset) { +MutualClusteringConcordance <- function(tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -567,8 +569,12 @@ MutualClusteringConcordance <- function (tree, dataset) { #' @importFrom utils combn #' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels #' @export -QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, - return = "mean") { +QuartetConcordance <- function( + tree, + dataset = NULL, + weight = TRUE, + return = "edge" +) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -660,7 +666,7 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, .ExpectedMICache$get(key) } else { ret <- expected_mi(a, b) - + # Cache: .ExpectedMICache$set(key, ret) # Return: @@ -678,10 +684,10 @@ QuartetConcordance <- function(tree, dataset = NULL, weight = TRUE, } #' @rdname SiteConcordance -#' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits +#' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits #' MatchStrings #' @export -PhylogeneticConcordance <- function (tree, dataset) { +PhylogeneticConcordance <- function(tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -731,7 +737,7 @@ PhylogeneticConcordance <- function (tree, dataset) { #' @importFrom TreeTools as.multiPhylo MatchStrings #' @importFrom TreeDist ClusteringInfo SharedPhylogeneticInfo #' @export -SharedPhylogeneticConcordance <- function (tree, dataset) { +SharedPhylogeneticConcordance <- function(tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -752,18 +758,18 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { } #' Evaluate the concordance of information between a tree and a dataset -#' +#' #' Details the amount of information in a phylogenetic dataset that is #' consistent with a specified phylogenetic tree, and the signal:noise #' ratio of the character matrix implied if the tree is true. -#' +#' #' Presently restricted to datasets whose characters contain a maximum of #' two parsimony-informative states. 
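The elements of the returned vector (listed below) are related arithmetically, as implemented in the function body further down this file; a quick sketch of checks that should hold, reusing the example from this help page:

library("TreeSearch")
data("congreveLamsdellMatrices", package = "TreeSearch")
dataset <- congreveLamsdellMatrices[[42]]
ci <- ConcordantInformation(TreeTools::NJTree(dataset), dataset)
all.equal(ci[["signalToNoise"]], ci[["signal"]] / ci[["noise"]])
all.equal(ci[["matrixToTree"]],
          ci[["informationContent"]] / ci[["treeInformation"]])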
-#' +#' #' @return `ConcordantInformation()` returns a named vector with elements: -#' +#' #' - `informationContent`: cladistic information content of `dataset` -#' - `signal`, `noise`: amount of cladistic information that represents +#' - `signal`, `noise`: amount of cladistic information that represents #' phylogenetic signal and noise, according to `tree` #' - `signalToNoise`: the implied signal:noise ratio of `dataset` #' - `treeInformation`: the cladistic information content of a bifurcating tree @@ -775,7 +781,7 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { #' - `ignored`: information content of characters whose signal and noise could #' not be calculated (too many states) and so are not included in the totals #' above. -#' +#' #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony #' @examples @@ -785,16 +791,16 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { #' @template MRS #' @importFrom TreeTools Log2UnrootedMult Log2Unrooted MatchStrings #' @export -ConcordantInformation <- function (tree, dataset) { +ConcordantInformation <- function(tree, dataset) { dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] originalInfo <- sum(apply(PhyDatToMatrix(dataset), 2, CharacterInformation)) dataset <- PrepareDataProfile(dataset) - + extraSteps <- CharacterLength(tree, dataset, compress = TRUE) - MinimumLength(dataset, compress = TRUE) chars <- matrix(unlist(dataset), attr(dataset, "nr")) ambiguousToken <- which(attr(dataset, "allLevels") == "?") - asSplits <- apply(chars, 1, function (x) { + asSplits <- apply(chars, 1, function(x) { ret <- table(x) if (length(ambiguousToken) != 0) { ret[names(ret) != ambiguousToken] @@ -821,8 +827,7 @@ ConcordantInformation <- function (tree, dataset) { }, double(1)) noise <- ic - signal noise[noise < sqrt(.Machine[["double.eps"]])] <- 0 - - + index <- attr(dataset, "index") if (any(is.na(signal))) { na <- is.na(signal) @@ -837,7 +842,7 @@ ConcordantInformation <- function (tree, dataset) { totalNoise <- sum(noise[index], na.rm = TRUE) totalSignal <- sum(signal[index], na.rm = TRUE) signalNoise <- totalSignal / totalNoise - + infoNeeded <- Log2Unrooted(length(dataset)) infoOverkill <- totalInfo / infoNeeded @@ -855,7 +860,7 @@ ConcordantInformation <- function (tree, dataset) { totalSignal <- sum(signal[index]) signalNoise <- totalSignal / totalNoise discarded = 0 - + infoNeeded <- Log2Unrooted(length(dataset)) infoOverkill <- totalInfo / infoNeeded discarded <- originalInfo - totalInfo @@ -872,22 +877,23 @@ ConcordantInformation <- function (tree, dataset) { signif(infoNeeded), " needed. 
", "S:N = ", signif(signalNoise), "\n") } - + # Return: - c(informationContent = totalInfo, + c( + informationContent = totalInfo, signal = totalSignal, noise = totalNoise, - signalToNoise = signalNoise, - + signalToNoise = signalNoise, + treeInformation = infoNeeded, matrixToTree = infoOverkill, ignored = discarded - ) + ) } #' @rdname ConcordantInformation #' @export -Evaluate <- function (tree, dataset) { +Evaluate <- function(tree, dataset) { .Deprecated("ConcordantInformation()") ConcordantInformation(tree, dataset) } From dcc314873832ea2c9ffb640410fcdbc564172f61 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:41:48 +0000 Subject: [PATCH 26/31] opts --- R/Concordance.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Concordance.R b/R/Concordance.R index ca6510d02..08ea366d7 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -604,11 +604,11 @@ QuartetConcordance <- function( num <- raw_counts$concordant den <- raw_counts$decisive + options <- c("character", "site", "default") return <- options[[pmatch(tolower(trimws(return)), options, nomatch = length(options))]] - options <- c("character", "site", "default") if (return == "default") { if (isTRUE(weight)) { # Sum numerator and denominator across sites (columns), then divide From 25b0859ef17959beb26315ab1a4a77ef40fc5a1d Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:45:35 +0000 Subject: [PATCH 27/31] Fix bracket mismatch Whilst cursing the formatter! --- R/Concordance.R | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index 08ea366d7..64c8345c0 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -634,19 +634,21 @@ QuartetConcordance <- function( # return = "char" p <- num / den if (isTRUE(weight)) { - vapply(seq_len(dim(num)[[2]]), function(i) { - weighted.mean(num[, i] / den[, i], den[, i]) - }, double(1)) + vapply( + seq_len(dim(num)[[2]]), function(i) { weighted.mean(num[, i] / den[, i], den[, i]) + }, + double(1) + ) } else { - vapply(seq_len(dim(num)[[2]]), function(i) { - mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) - }, double(1)) vapply( seq_len(dim(num)[[2]]), + function(i) { + mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) }, double(1) + ) } } } From 415bf8524db555fce64485d5236ac54c623dfe8c Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:45:44 +0000 Subject: [PATCH 28/31] Install to tmp lib --- benchmark/bench-iw.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmark/bench-iw.R b/benchmark/bench-iw.R index 0ebc566c8..aad034a42 100644 --- a/benchmark/bench-iw.R +++ b/benchmark/bench-iw.R @@ -1,4 +1,10 @@ library("TreeTools") + +tmp_lib <- tempfile(pattern = "lib") +dir.create(tmp_lib) +devtools::install(args = paste0("--library=", tmp_lib)) +library("TreeSearch", lib.loc = tmp_lib) + data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[42]] tree <- AdditionTree(dataset) |> RenumberTips(names(dataset)) |> RootTree(1) From 8b92f0018363cb63736088ce591b461a63d3333f Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 12:57:45 +0000 Subject: [PATCH 29/31] =?UTF-8?q?;=E2=86=92,?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/Concordance.R | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/R/Concordance.R b/R/Concordance.R index 64c8345c0..fe76056bd 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -2,7 +2,7 @@ #' #' Concordance measures the strength of support that characters in a dataset #' present for each split (=edge/branch) in a tree -#' \insertCite{Minh2020;SmithConc}{TreeSearch}. +#' \insertCite{Minh2020,SmithConc}{TreeSearch}. #' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber From c512407807fbc6dd41cf0c922b327a322d5949e0 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 13:13:35 +0000 Subject: [PATCH 30/31] Expand documentation incl. PhylogeneticConcordance() --- R/Concordance.R | 98 +++++++++++++++++++++++++++++------------- man/SiteConcordance.Rd | 98 +++++++++++++++++++++++++++++------------- 2 files changed, 136 insertions(+), 60 deletions(-) diff --git a/R/Concordance.R b/R/Concordance.R index fe76056bd..c2b133502 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -50,36 +50,51 @@ NULL #' @rdname SiteConcordance #' @details -#' `ClusteringConcordance()` measures how well each split reflects the -#' grouping information present in each character -#' \insertCite{SmithConc}{TreeSearch}. It treats characters and -#' splits as clusterings of taxa, quantifying their agreement using normalized -#' mutual information. Values range from 1 (perfect agreement) to 0 (no more -#' agreement than expected by chance); negative values indicate less agreement -#' than expected. Summaries returned by `return = "edge"` or `"char"` report, -#' respectively, how well each split is supported by all characters, or how -#' well each character is reflected across all splits. -#' -#' @param return Character specifying whether to summarize support per -#' character (`"char"`) or per edge (`"edge"`). See below for details. -#' @param normalize Logical or numeric; if `TRUE` the mutual information will be -#' normalized such that zero corresponds to the expected mutual information of -#' a randomly drawn character with the same distribution of tokens. -#' If `FALSE`, zero will correspond to zero mutual information, -#' even if this is not achievable in practice. -#' The exact analytical solution, though fast, does not account for -#' non-independence between splits. This limitation is minor for larger -#' trees, and becomes negligible for trees with more than ~200 leaves. -#' For smaller trees, the expected value for random trees can be approximated -#' by resampling relabelled trees. Setting `normalize = n` will approximate the -#' expected value based on _n_ samples. -#' -#' For `return = "char"`, `"tree"`, values will be normalized such that 1 -#' corresponds to the maximum possible value, and 0 to the expected value. -#' If `normalize = TRUE`, this will be the expected value for a random -#' character on the given tree. If `normalize` is numeric, the expected value -#' will be estimated by fitting the character to `n` uniformly random trees. -#' +#' `ClusteringConcordance()` measures how well each tree split reflects the +#' grouping structure implied by each character +#' \insertCite{SmithConc}{TreeSearch}. +#' Characters and splits are treated as clusterings of taxa, and their agreement +#' is quantified using mutual information (MI). All reported values are scaled +#' so that 1 corresponds to the maximum possible mutual information for each +#' split–character pair (`hBest`). 
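The scaling described above amounts to a linear rescale that sends the expected MI to zero and `hBest` to one. The helper `.Rezero()` called in the function body is not shown in this patch, so the following is an assumed sketch of that arithmetic; the name `RescaleMI` is illustrative only:

RescaleMI <- function(mi, miExpected, hBest) {
  # 0 at the expected (chance) MI, 1 at the maximum attainable MI
  (mi - miExpected) / (hBest - miExpected)
}
RescaleMI(mi = 0.6, miExpected = 0.2, hBest = 1)  # 0.5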
+#' 
+#' The `normalize` argument specifies how the zero point is defined.
+#' 
+#' - If `normalize = FALSE`, zero corresponds to *zero* MI, without correcting
+#'   for the positive bias that arises because MI is rarely exactly zero in
+#'   finite samples.
+#' 
+#' - If `normalize = TRUE`, the expected MI is computed using an analytical
+#'   approximation based on the distribution of character tokens. This is fast
+#'   and generally accurate for large trees (≈200+ taxa), but does not account
+#'   for correlation between splits.
+#' 
+#' - If `normalize` is a positive integer `n`, the expected MI is estimated
+#'   empirically by fitting each character to `n` uniformly random trees and
+#'   averaging the resulting MI values. This Monte Carlo approach provides a
+#'   more accurate baseline for small trees, for which the analytical
+#'   approximation is biased. Monte Carlo standard errors are returned.
+#' 
+#' @param return Character specifying the summary to return. Options are:
+#' - `"edge"`: average concordance of each tree split across all characters;
+#' - `"char"`: concordance of each character averaged over all splits, weighted
+#'   by each character's information content;
+#' - `"tree"`: an overall tree‑level concordance score;
+#' - `"all"`: a full array of MI components and normalized values for every
+#'   split–character pair.
+#' 
+#' Matching is case‑insensitive and partial.
+#' 
+#' @param normalize Controls how the *expected* mutual information (the zero
+#' point of the scale) is determined.
+#' - `FALSE`: no chance correction; MI is scaled only by its maximum.
+#' - `TRUE`: subtract the **analytical** expected MI for random association.
+#' - A positive integer `n`: subtract an **empirical** expected MI estimated
+#'   from `n` random trees.
+#' 
+#' In all cases, 1 corresponds to the maximal attainable MI for the pair
+#' (`hBest`), and 0 corresponds to the chosen expectation.
+#' 
 #' @returns 
 #' `ClusteringConcordance(return = "all")` returns a 3D array where each
 #' slice corresponds to a character (site), each column to a tree split, and
@@ -686,6 +701,29 @@ QuartetConcordance <- function(
 }
 
 #' @rdname SiteConcordance
+#' 
+#' @details
+#' `PhylogeneticConcordance()` treats each character in `dataset` as a
+#' phylogenetic hypothesis and measures the extent to which it supports the
+#' splits of `tree`. Each character is first interpreted as a tree (or set of
+#' trees) in which taxa sharing the same token form a clade. Only splits for
+#' which the character contains at least four relevant taxa can contribute
+#' information.
+#' 
+#' For each split, the function identifies which characters could potentially
+#' support that split (i.e. those for which the induced subtrees contain
+#' informative structure), and among these, which characters are actually
+#' compatible with the split. The concordance value for each split is the
+#' proportion of informative characters that support it.
+#' A value of 1 indicates that all characters informative for that subset of
+#' taxa support the split; a value of 0 indicates that none do. Characters that
+#' contain only ambiguous or uninformative states for the relevant taxa do not
+#' affect the result.
+#' 
+#' @return `PhylogeneticConcordance()` returns a numeric vector giving the
+#'   phylogenetic concordance of each split in `tree`, named according to the
+#'   split's internal numbering. 
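A brief usage sketch, with `tree` and `dataset` defined as in the shared examples earlier in this file; the labelling call follows the pattern used in those examples:

pc <- PhylogeneticConcordance(tree, dataset)
plot(tree)
TreeTools::LabelSplits(tree, signif(pc, 3), cex = 0.8)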
+#' #' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits #' MatchStrings #' @export diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index c1a253eff..dbffef045 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -28,26 +28,29 @@ Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. Additive (ordered) characters can be handled using \code{\link[TreeTools]{Decompose}()}.} -\item{return}{Character specifying whether to summarize support per -character (\code{"char"}) or per edge (\code{"edge"}). See below for details.} - -\item{normalize}{Logical or numeric; if \code{TRUE} the mutual information will be -normalized such that zero corresponds to the expected mutual information of -a randomly drawn character with the same distribution of tokens. -If \code{FALSE}, zero will correspond to zero mutual information, -even if this is not achievable in practice. -The exact analytical solution, though fast, does not account for -non-independence between splits. This limitation is minor for larger -trees, and becomes negligible for trees with more than ~200 leaves. -For smaller trees, the expected value for random trees can be approximated -by resampling relabelled trees. Setting \code{normalize = n} will approximate the -expected value based on \emph{n} samples. - -For \code{return = "char"}, \code{"tree"}, values will be normalized such that 1 -corresponds to the maximum possible value, and 0 to the expected value. -If \code{normalize = TRUE}, this will be the expected value for a random -character on the given tree. If \code{normalize} is numeric, the expected value -will be estimated by fitting the character to \code{n} uniformly random trees.} +\item{return}{Character specifying the summary to return. Options are: +\itemize{ +\item \code{"edge"}: average concordance of each tree split across all characters; +\item \code{"char"}: concordance of each character averaged over all splits, weighted +by each character's information content; +\item \code{"tree"}: an overall tree‑level concordance score; +\item \code{"all"}: a full array of MI components and normalized values for every +split–character pair. +} + +Matching is case‑insensitive and partial.} + +\item{normalize}{Controls how the \emph{expected} mutual information (the zero +point of the scale) is determined. +\itemize{ +\item \code{FALSE}: no chance correction; MI is scaled only by its maximum. +\item \code{TRUE}: subtract the \strong{analytical} expected MI for random association. +\item \verb{}: subtract an \strong{empirical} expected MI estimated from that +number of random trees. +} + +In all cases, 1 corresponds to the maximal attainable MI for the pair +(\code{hBest}), and 0 corresponds to the chosen expectation.} \item{weight}{Logical specifying whether to weight sites according to the number of quartets they are decisive for.} @@ -99,6 +102,10 @@ each corresponding split in \code{tree}. \code{QuartetConcordance(return = "char")} returns a numeric vector giving the concordance index calculated at each site, averaged across all splits. +\code{PhylogeneticConcordance()} returns a numeric vector giving the +phylogenetic information of each split in \code{tree}, named according to the +split's internal numbering. + \code{SharedPhylogeneticConcordance()} returns the shared phylogenetic concordance of each character in \code{dataset} with \code{tree}. The attribute \code{weighted.mean} gives the mean value, weighted by the @@ -107,18 +114,32 @@ information content of each character. 
\description{ Concordance measures the strength of support that characters in a dataset present for each split (=edge/branch) in a tree -\insertCite{Minh2020;SmithConc}{TreeSearch}. +\insertCite{Minh2020,SmithConc}{TreeSearch}. } \details{ -\code{ClusteringConcordance()} measures how well each split reflects the -grouping information present in each character -\insertCite{SmithConc}{TreeSearch}. It treats characters and -splits as clusterings of taxa, quantifying their agreement using normalized -mutual information. Values range from 1 (perfect agreement) to 0 (no more -agreement than expected by chance); negative values indicate less agreement -than expected. Summaries returned by \code{return = "edge"} or \code{"char"} report, -respectively, how well each split is supported by all characters, or how -well each character is reflected across all splits. +\code{ClusteringConcordance()} measures how well each tree split reflects the +grouping structure implied by each character +\insertCite{SmithConc}{TreeSearch}. +Characters and splits are treated as clusterings of taxa, and their agreement +is quantified using mutual information (MI). All reported values are scaled +so that 1 corresponds to the maximum possible mutual information for each +split–character pair (\code{hBest}). + +The \code{normalize} argument specifies how the zero point is defined. +\itemize{ +\item If \code{normalize = FALSE}, zero corresponds to \emph{zero} MI, without correcting +for the positive bias that arises because MI is rarely exactly zero in +finite samples. +\item If \code{normalize = TRUE}, the expected MI is computed using an analytical +approximation based on the distribution of character tokens. This is fast +and generally accurate for large trees (≈200+ taxa), but does not account +for correlation between splits. +\item If \code{normalize} is a positive integer \code{n}, the expected MI is estimated +empirically by fitting each character to \code{n} uniformly random trees and +averaging the resulting MI values. This Monte Carlo approach provides a +more accurate baseline for small trees, for which the analytical +approximation is biased. Monte Carlo standard errors are returned. +} \code{MutualClusteringConcordance()} provides a character‑wise summary that emphasises each character’s best‑matching split(s). It treats each character @@ -154,6 +175,23 @@ of quartets for a faster, if potentially less accurate, computation. Ambiguous and inapplicable tokens are treated as containing no grouping information (i.e. \code{(02)} or \code{-} are each treated as \verb{?}). +\code{PhylogeneticConcordance()} treats each character in \code{dataset} as a +phylogenetic hypothesis and measures the extent to which it supports the +splits of \code{tree}. Each character is first interpreted as a tree (or set of +trees) in which taxa sharing the same token form a clade. Only splits for +which the character contains at least four relevant taxa can contribute +information. + +For each split, the function identifies which characters could potentially +support that split (i.e. those for which the induced subtrees contain +informative structure), and among these, which characters are actually +compatible with the split. The concordance value for each split is the +proportion of informative characters that support it. +A value of 1 indicates that all characters informative for that subset of +taxa support the split; a value of 0 indicates that none do. 
Characters that +contain only ambiguous or uninformative states for the relevant taxa do not +affect the result. + \code{SharedPhylogeneticConcordance()} treats each character as a simple tree. Each token in the character corresponds to a node whose pendant edges are the taxa with that token. From f91f38156997c3a301ab74064268fc9126b3b755 Mon Sep 17 00:00:00 2001 From: RevBayes analysis <1695515+ms609@users.noreply.github.com> Date: Thu, 15 Jan 2026 13:28:57 +0000 Subject: [PATCH 31/31] v1.8.0 --- DESCRIPTION | 2 +- NEWS.md | 8 ++---- codemeta.json | 80 +++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 992de6f1b..f5f882b64 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.7.0.9001 +Version: 1.8.0 Authors@R: c( person( "Martin R.", 'Smith', diff --git a/NEWS.md b/NEWS.md index fbbacc43b..9753419f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,12 +1,8 @@ -# Branch `char-concord` (development) +# TreeSearch 1.8.0 (2025-01-15) - Implements the methods of Smith (forthcoming) via `ClusteringConcordance()`, - visualized with `ConcordanceTable()` and support functions `QACol()`, `QALegend()` + with visualization functions `ConcordanceTable()`, `QACol()` and `QALegend()`. - `QuartetConcordance()` gains `return` parameter and fast C++ implementation. - - -# TreeSearch 1.7.0.9000 (development) - - Fix regression in `MaximumLength()`. diff --git a/codemeta.json b/codemeta.json index c5d4941cc..77696cabf 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,13 +8,13 @@ "codeRepository": "https://github.com/ms609/TreeSearch/", "issueTracker": "https://github.com/ms609/TreeSearch/issues/", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "1.7.0", + "version": "1.8.0", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.5.1 (2025-06-13)", + "runtimePlatform": "R version 4.5.2 (2025-10-31 ucrt)", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", @@ -174,6 +174,18 @@ "version": ">= 4.0" }, "2": { + "@type": "SoftwareApplication", + "identifier": "abind", + "name": "abind", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=abind" + }, + "3": { "@type": "SoftwareApplication", "identifier": "ape", "name": "ape", @@ -186,7 +198,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=ape" }, - "3": { + "4": { + "@type": "SoftwareApplication", + "identifier": "base64enc", + "name": "base64enc", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=base64enc" + }, + "5": { "@type": "SoftwareApplication", "identifier": "cli", "name": "cli", @@ -199,7 +223,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=cli" }, - "4": { + "6": { "@type": "SoftwareApplication", "identifier": "cluster", "name": "cluster", @@ -211,7 +235,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=cluster" }, - "5": { + "7": { + "@type": "SoftwareApplication", + "identifier": "colorspace", + "name": "colorspace", + "provider": { + "@id": "https://cran.r-project.org", + "@type": 
"Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=colorspace" + }, + "8": { "@type": "SoftwareApplication", "identifier": "fastmap", "name": "fastmap", @@ -223,7 +259,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fastmap" }, - "6": { + "9": { "@type": "SoftwareApplication", "identifier": "fastmatch", "name": "fastmatch", @@ -236,7 +272,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fastmatch" }, - "7": { + "10": { "@type": "SoftwareApplication", "identifier": "fs", "name": "fs", @@ -248,7 +284,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fs" }, - "8": { + "11": { "@type": "SoftwareApplication", "identifier": "future", "name": "future", @@ -260,7 +296,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=future" }, - "9": { + "12": { "@type": "SoftwareApplication", "identifier": "PlotTools", "name": "PlotTools", @@ -272,7 +308,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=PlotTools" }, - "10": { + "13": { "@type": "SoftwareApplication", "identifier": "promises", "name": "promises", @@ -284,7 +320,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=promises" }, - "11": { + "14": { "@type": "SoftwareApplication", "identifier": "protoclust", "name": "protoclust", @@ -296,7 +332,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=protoclust" }, - "12": { + "15": { "@type": "SoftwareApplication", "identifier": "Rcpp", "name": "Rcpp", @@ -308,7 +344,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rcpp" }, - "13": { + "16": { "@type": "SoftwareApplication", "identifier": "Rdpack", "name": "Rdpack", @@ -321,7 +357,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rdpack" }, - "14": { + "17": { "@type": "SoftwareApplication", "identifier": "Rogue", "name": "Rogue", @@ -334,7 +370,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rogue" }, - "15": { + "18": { "@type": "SoftwareApplication", "identifier": "shiny", "name": "shiny", @@ -347,7 +383,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=shiny" }, - "16": { + "19": { "@type": "SoftwareApplication", "identifier": "shinyjs", "name": "shinyjs", @@ -359,12 +395,12 @@ }, "sameAs": "https://CRAN.R-project.org/package=shinyjs" }, - "17": { + "20": { "@type": "SoftwareApplication", "identifier": "stats", "name": "stats" }, - "18": { + "21": { "@type": "SoftwareApplication", "identifier": "stringi", "name": "stringi", @@ -376,7 +412,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=stringi" }, - "19": { + "22": { "@type": "SoftwareApplication", "identifier": "TreeDist", "name": "TreeDist", @@ -389,7 +425,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=TreeDist" }, - "20": { + "23": { "@type": "SoftwareApplication", "identifier": "TreeTools", "name": "TreeTools", @@ -404,7 +440,7 @@ }, "SystemRequirements": "C++17" }, - "fileSize": "4584.9KB", + "fileSize": "4632.484KB", "citation": [ { "@type": "SoftwareSourceCode", @@ -476,7 +512,7 @@ ], "name": "{TreeSearch}: phylogenetic analysis with discrete character data", "identifier": "10.5281/zenodo.1042590", - "description": "R package version 1.7.0", + "description": "R package version 1.8.0", "@id": "https://doi.org/10.5281/zenodo.1042590", "sameAs": "https://doi.org/10.5281/zenodo.1042590" },