diff --git a/DESCRIPTION b/DESCRIPTION index 911f634f8..f5f882b64 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.7.0 +Version: 1.8.0 Authors@R: c( person( "Martin R.", 'Smith', @@ -33,10 +33,13 @@ URL: https://ms609.github.io/TreeSearch/ (doc), https://github.com/ms609/TreeSearch/ (devel) BugReports: https://github.com/ms609/TreeSearch/issues/ Depends: R (>= 4.0) -Imports: +Imports: + abind, ape (>= 5.6), + base64enc, cli (>= 3.0), cluster, + colorspace, fastmap, fastmatch (>= 1.1.3), fs, diff --git a/NAMESPACE b/NAMESPACE index 79e40d690..e542e3959 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ export(Carter1) export(CharacterLength) export(ClusterStrings) export(ClusteringConcordance) +export(ConcordanceTable) export(ConcordantInfo) export(ConcordantInformation) export(Consistency) @@ -68,6 +69,9 @@ export(PolEscapa) export(PrepareDataIW) export(PrepareDataProfile) export(PresCont) +export(QACol) +export(QALegend) +export(QCol) export(QuartetConcordance) export(QuartetResolution) export(RandomMorphyTree) @@ -140,9 +144,11 @@ importFrom(TreeDist,ClusteringInfoDistance) importFrom(TreeDist,Entropy) importFrom(TreeDist,MutualClusteringInfo) importFrom(TreeDist,SharedPhylogeneticInfo) +importFrom(TreeDist,entropy_int) importFrom(TreeTools,AddTipEverywhere) importFrom(TreeTools,AddUnconstrained) importFrom(TreeTools,CharacterInformation) +importFrom(TreeTools,CladeSizes) importFrom(TreeTools,CladisticInfo) importFrom(TreeTools,CompatibleSplits) importFrom(TreeTools,Consensus) @@ -159,6 +165,7 @@ importFrom(TreeTools,Log2Unrooted) importFrom(TreeTools,Log2UnrootedMult) importFrom(TreeTools,LogDoubleFactorial) importFrom(TreeTools,MakeTreeBinary) +importFrom(TreeTools,MatchStrings) importFrom(TreeTools,MatrixToPhyDat) importFrom(TreeTools,NRooted) importFrom(TreeTools,NSplits) @@ -189,6 +196,7 @@ importFrom(TreeTools,TipLabels) 
importFrom(TreeTools,TreeIsRooted) importFrom(TreeTools,as.Splits) importFrom(TreeTools,as.multiPhylo) +importFrom(abind,abind) importFrom(ape,consensus) importFrom(ape,keep.tip) importFrom(ape,nodelabels) @@ -196,6 +204,7 @@ importFrom(ape,plot.phylo) importFrom(ape,read.nexus) importFrom(ape,root) importFrom(ape,write.nexus) +importFrom(base64enc,base64encode) importFrom(cli,cli_alert) importFrom(cli,cli_alert_danger) importFrom(cli,cli_alert_info) @@ -207,11 +216,17 @@ importFrom(cli,cli_progress_done) importFrom(cli,cli_progress_update) importFrom(cluster,pam) importFrom(cluster,silhouette) +importFrom(colorspace,hex) +importFrom(colorspace,max_chroma) +importFrom(colorspace,polarLUV) importFrom(fastmap,fastmap) importFrom(fastmatch,"%fin%") importFrom(fastmatch,fmatch) importFrom(fs,path_sanitize) importFrom(future,future) +importFrom(graphics,abline) +importFrom(graphics,image) +importFrom(graphics,mtext) importFrom(graphics,par) importFrom(promises,future_promise) importFrom(protoclust,protoclust) diff --git a/NEWS.md b/NEWS.md index a08a957e6..9753419f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ -# TreeSearch 1.7.0.9000 (development) +# TreeSearch 1.8.0 (2025-01-15) +- Implements the methods of Smith (forthcoming) via `ClusteringConcordance()`, + with visualization functions `ConcordanceTable()`, `QACol()` and `QALegend()`. +- `QuartetConcordance()` gains `return` parameter and fast C++ implementation. - Fix regression in `MaximumLength()`. @@ -14,6 +17,7 @@ - Support single-character matrices in `ClusteringConcordance()` - Fix `DoNothing(x)` to return `x` (not `NULL`) - Remove unused `delete_rawdata()` due to implementation issues. +- Port `MaximumLength()` to C++ to handle more characters, more efficiently. 
# TreeSearch 1.6.1 (2025-06-10) diff --git a/R/Concordance.R b/R/Concordance.R index c5c76641b..c2b133502 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -1,96 +1,595 @@ -#' Calculate site concordance factor -#' -#' The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure -#' of the strength of support that the dataset presents for a given split in a -#' tree. -#' -#' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) -#' that are decisive for a split which are also concordant with it. -#' For example, a quartet with the characters `0 0 0 1` is not decisive, as -#' all relationships between those leaves are equally parsimonious. -#' But a quartet with characters `0 0 1 1` is decisive, and is concordant -#' with any tree that groups the first two leaves together to the exclusion -#' of the second. -#' -#' By default, the reported value weights each site by the number of quartets -#' it is decisive for. This value can be interpreted as the proportion of -#' all decisive quartets that are concordant with a split. -#' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. -#' Consider a split associated with two sites: -#' one that is concordant with 25% of 96 decisive quartets, and -#' a second that is concordant with 75% of 4 decisive quartets. -#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. -#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. -#' -#' `QuartetConcordance()` is computed exactly, using all quartets, where as -#' other implementations (e.g. IQ-TREE) follow -#' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -#' of quartets for a faster, if potentially less accurate, computation. 
-#' -# `ClusteringConcordance()` and `PhylogeneticConcordance()` respectively report -# the proportion of clustering information and phylogenetic information -# \insertCite{@as defined in @Vinh2010, @SmithDist}{TreeDist} within a dataset -# that is reflected in each split. -# These give smaller values because a split may be compatible with a character -# without being identical to it. -#TODO More thought / explanation needed. -#' -#TODO Finally, `ProfileConcordance()` (to follow) -#' -#' **NOTE:** These functions are under development. They are incompletely -#' tested, and may change without notice. -#' Complete documentation and discussion will follow in due course. -#' +#' Concordance factors +#' +#' Concordance measures the strength of support that characters in a dataset +#' present for each split (=edge/branch) in a tree +#' \insertCite{Minh2020,SmithConc}{TreeSearch}. +#' # # Renumber before MaximizeParsimony, for `tree` #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony -#' @param weight Logical specifying whether to weight sites according to the -#' number of quartets they are decisive for. 
-#' -#' -#' -#' @references +#' +#' @references #' \insertAllCited{} -#' -#' @examples +#' +#' @examples #' data("congreveLamsdellMatrices", package = "TreeSearch") #' dataset <- congreveLamsdellMatrices[[1]][, 1:20] -#' tree <- referenceTree -#' qc <- QuartetConcordance(tree, dataset) +#' tree <- TreeSearch::referenceTree +#' #' cc <- ClusteringConcordance(tree, dataset) +#' mcc <- MutualClusteringConcordance(tree, dataset) +#' +#' qc <- QuartetConcordance(tree, dataset) +#' #' pc <- PhylogeneticConcordance(tree, dataset) #' spc <- SharedPhylogeneticConcordance(tree, dataset) -#' mcc <- MutualClusteringConcordance(tree, dataset) -#' +#' #' oPar <- par(mar = rep(0, 4), cex = 0.8) # Set plotting parameters #' plot(tree) #' TreeTools::LabelSplits(tree, signif(qc, 3), cex = 0.8) #' plot(tree) #' TreeTools::LabelSplits(tree, signif(cc, 3), cex = 0.8) #' par(oPar) # Restore plotting parameters -#' +#' #' # Write concordance factors to file -#' labels <- paste0(qc, "/", cc, "/", pc) # "/" is a valid delimiter +#' labels <- paste0(cc, "/", qc, "/", pc) # "/" is a valid delimiter #' # Identify the node that corresponds to each label #' whichNode <- match(TreeTools::NTip(tree) + 1:tree$Nnode, names(qc)) -#' +#' #' # The contents of tree$node.label will be written at each node #' tree$node.label <- labels[whichNode] -#' +#' #' ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") -#' +#' #' # Display correlation between concordance factors -#' pairs(cbind(qc, cc, pc, spc, mcc), asp = 1) +#' pairs(cbind(cc, mcc, qc, pc, spc), asp = 1) +#' @template MRS +#' @family split support functions +#' @name SiteConcordance +NULL + +#' @rdname SiteConcordance +#' @details +#' `ClusteringConcordance()` measures how well each tree split reflects the +#' grouping structure implied by each character +#' \insertCite{SmithConc}{TreeSearch}. +#' Characters and splits are treated as clusterings of taxa, and their agreement +#' is quantified using mutual information (MI). 
All reported values are scaled +#' so that 1 corresponds to the maximum possible mutual information for each +#' split–character pair (`hBest`). +#' +#' The `normalize` argument specifies how the zero point is defined. +#' +#' - If `normalize = FALSE`, zero corresponds to *zero* MI, without correcting +#' for the positive bias that arises because MI is rarely exactly zero in +#' finite samples. +#' +#' - If `normalize = TRUE`, the expected MI is computed using an analytical +#' approximation based on the distribution of character tokens. This is fast +#' and generally accurate for large trees (≈200+ taxa), but does not account +#' for correlation between splits. +#' +#' - If `normalize` is a positive integer `n`, the expected MI is estimated +#' empirically by fitting each character to `n` uniformly random trees and +#' averaging the resulting MI values. This Monte Carlo approach provides a +#' more accurate baseline for small trees, for which the analytical +#' approximation is biased. Monte Carlo standard errors are returned. +#' +#' @param return Character specifying the summary to return. Options are: +#' - `"edge"`: average concordance of each tree split across all characters; +#' - `"char"`: concordance of each character averaged over all splits, weighted +#' by each character's information content; +#' - `"tree"`: an overall tree‑level concordance score; +#' - `"all"`: a full array of MI components and normalized values for every +#' split–character pair. +#' +#' Matching is case‑insensitive and partial. +#' +#' @param normalize Controls how the *expected* mutual information (the zero +#' point of the scale) is determined. +#' - `FALSE`: no chance correction; MI is scaled only by its maximum. +#' - `TRUE`: subtract the **analytical** expected MI for random association. +#' - A positive integer: subtract an **empirical** expected MI estimated from that +#' number of random trees.
+#' +#' In all cases, 1 corresponds to the maximal attainable MI for the pair +#' (`hBest`), and 0 corresponds to the chosen expectation. +#' +#' @returns +#' `ClusteringConcordance(return = "all")` returns a 3D array where each +#' slice corresponds to a character (site), each column to a tree split, and +#' each row to a different information measure. The `normalized` row gives the +#' normalized mutual information between each split-character pair, scaled so +#' that 1.0 corresponds to `hBest` (the theoretical maximum mutual information, +#' being the minimum of `hSplit` and `hChar`) and 0.0 corresponds to `miRand` +#' (the expected mutual information under random association). `hSplit` gives +#' the entropy (information content) of each split's bipartition; `hChar` gives +#' the entropy of each character's state distribution; `hJoint` gives the joint +#' entropy of the split-character confusion matrix; `mi` gives the raw mutual +#' information; and `n` records the number of informative observations. +#' Negative normalized values indicate observed mutual information below random +#' expectation. `NA` is returned when `hBest = 0` (no information potential). +#' +#' `ClusteringConcordance(return = "edge")` returns a vector where each element +#' corresponds to a split (an edge of the tree) and gives the normalized mutual +#' information between that split and the character data, averaged across all +#' characters. +#' When `normalize = TRUE` (default), values are scaled relative to random +#' expectation; when `FALSE`, raw mutual information normalized by `hBest` is +#' returned. +#' +#' `ClusteringConcordance(return = "char")` returns a vector where each element +#' corresponds to a character (site) and gives the entropy-weighted average +#' normalized mutual information between that character and all tree splits. 
+#' Characters with higher information content receive proportionally more weight +#' from splits that can potentially convey more information about them. +#' +#' `ClusteringConcordance(return = "tree")` returns a single value representing +#' the overall concordance between the tree topology and the character data. +#' This averages the fit of the best-matching split for each character. +#' This is included for completeness, though it is not clear that this is a useful +#' or meaningful measure. +# +# I had previously considered calculating the entropy-weighted average of normalized +# mutual information across all split-character pairs, where each pair contributes +# proportionally to its potential information content. +# The problem here is that imperfect matches between compatible splits +# come to dominate, resulting in a small score that gets smaller as trees get +# larger, even with a perfect fit. +#' +#' +#' @seealso +#' - [Consistency()] +#' @examples +#' data(congreveLamsdellMatrices) +#' myMatrix <- congreveLamsdellMatrices[[10]] +#' ClusteringConcordance(TreeTools::NJTree(myMatrix), myMatrix) +#' @template MRS +#' @importFrom abind abind +#' @importFrom stats setNames +#' @importFrom TreeDist ClusteringEntropy Entropy entropy_int +#' MutualClusteringInfo +#' @importFrom TreeTools as.Splits MatchStrings Subsplit TipLabels +#' @export +ClusteringConcordance <- function( + tree, + dataset, + return = "edge", + normalize = TRUE +) { + # Check inputs + if (is.null(dataset)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + if (is.null(tree)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + + keep <- MatchStrings(TipLabels(tree), names(dataset), warning) + if (length(keep) == 0) { + return(NULL) + } + dataset <- dataset[keep] + + # Prepare data + splits <- as.logical(as.Splits(tree)) + + at <- attributes(dataset) + cont <- at[["contrast"]] + if ("-" %in% colnames(cont)) { + cont[cont[, "-"] 
> 0, ] <- 1 + } + ambiguous <- rowSums(cont) != 1 + + mat <- matrix(as.integer(unlist(dataset)), length(dataset), byrow = TRUE) + mat[mat %in% which(ambiguous)] <- NA_integer_ + maxToken <- max(mat, na.rm = TRUE) + tokens <- as.character(seq_len(maxToken)) + mat <- apply(mat, 2, function(x) { + uniques <- tabulate(x, maxToken) == 1 + x[x %in% tokens[uniques]] <- NA_integer_ + x + }) + + # Calculate entropy + h <- simplify2array(apply(mat, 2, function(char) { + aChar <- !is.na(char) + ch <- char[aChar] + if (length(ch) == 0) { + # All ambiguous + n <- 0 + hChar <- 0 + } else { + chMax <- max(1, ch) + chTable <- tabulate(ch, chMax) + n <- length(ch) + hChar <- entropy_int(chTable) + } + + hh <- apply(splits[, aChar, drop = FALSE], 1, function (spl) { + spTable <- tabulate(spl + 1, 2) + if (any(spTable < 2)) { + c(hSplit = 0, + hJoint = hChar, + miRand = 0, + n = n) + } else { + c(hSplit = entropy_int(spTable), + hJoint = entropy_int(tabulate(ch + (spl * chMax), chMax + chMax)), + miRand = .ExpectedMI(spTable, chTable), + n = n) + } + }) + + rbind(hChar = hChar, hh) + }, simplify = FALSE)) + + if (length(dim(h)) == 2) { + # Matrix to 3D array + dim(h) <- c(dim(h), 1) + } + + h[abs(h) < sqrt(.Machine$double.eps)] <- 0 + hh <- h[, , at[["index"]], drop = FALSE] + + hBest <- `rownames<-`(pmin(hh["hChar", , , drop = FALSE], + hh["hSplit", , , drop = FALSE]), NULL) + mi <- `rownames<-`(hh["hChar", , , drop = FALSE] + + hh["hSplit", , , drop = FALSE] - + hh["hJoint", , , drop = FALSE], NULL) + miRand <- `rownames<-`(hh["miRand", , , drop = FALSE], NULL) + norm <- if (isFALSE(normalize)) { + ifelse(hBest == 0, NA, mi / hBest) + } else { + ifelse(hBest == 0, NA, .Rezero(mi / hBest, miRand / hBest)) + } + + returnType <- pmatch(tolower(return), c("all", "edge", "character", "tree"), + nomatch = 1L) + if (returnType %in% 3:4) { # character / tree + charSplits <- apply(mat, 2, simplify = FALSE, function(x) + as.Splits(x[!is.na(x)], tipLabels = keep[!is.na(x)])) + charMax <- 
vapply(charSplits, ClusteringEntropy, double(1))[ + attr(dataset, "index")] + charInfo <- MutualClusteringInfo(tree, charSplits)[at[["index"]]] + if (is.numeric(normalize)) { + rTrees <- replicate(normalize, RandomTree(tree), simplify = FALSE) + randInfo <- MutualClusteringInfo(rTrees, charSplits)[, attr(dataset, "index")] + randMean <- colMeans(randInfo) + var <- rowSums((t(randInfo) - randMean) ^ 2) / (normalize - 1) + mcse <- sqrt(var / normalize) + randTreeInfo <- rowSums(randInfo) + randTreeMean <- mean(randTreeInfo) + treeVar <- var(randTreeInfo) + mcseTree <- sqrt(treeVar / normalize) + } + } + + # Return: + switch(returnType, + # all + abind( + along = 1, + normalized = norm, + hh, + hBest = hBest, + mi = mi, + miRand = miRand + ), { + # edge + best <- rowSums(hBest[1, , , drop = FALSE], dims = 2) + ifelse(!is.na(best) & best == 0, + NA_real_, + if (isTRUE(normalize)) { + .Rezero( + rowSums(mi[1, , , drop = FALSE], dims = 2) / best, + rowSums(miRand[1, , , drop = FALSE], dims = 2) / best + ) + } else { + rowSums(mi[1, , , drop = FALSE], dims = 2) / best + })[1, ] + }, { + # char + + # one <- hh["hChar", 1, , drop = TRUE] # All rows equal + one <- charMax + zero <- if (isFALSE(normalize)) { + 0 + } else if (isTRUE(normalize)) { + apply(hh["miRand", , ], 2, max) + } else { + randMean + } + ret <- (charInfo - zero) / (one - zero) + if (is.numeric(normalize)) { + mcseInfo <- ((one - charInfo) / (one - zero) ^ 2) * mcse + mcseInfo[mcseInfo < sqrt(.Machine$double.eps)] <- 0 + structure(ret, hMax = charMax, mcse = mcseInfo) + } else { + structure(charInfo / charMax, hMax = charMax) + } + }, { + # tree: Entropy-weighted mean across best character-split pairs + warning("I'm not aware of a situation in which this is a useful measure.") + # Random normalization doesn't work if we pick the best matching + # split; one will be randomly better than another, even in the + # absence of signal. 
+ norm[is.na(norm)] <- 0 + bestMatch <- apply(norm, 3, which.max) + idx <- cbind(1, bestMatch, seq_along(bestMatch)) + return(weighted.mean(norm[idx], hBest[idx])) + + if (isFALSE(normalize)) { + } else { + one <- sum(hh["hChar", 1, ]) + zero <- if (isTRUE(normalize)) { + sum(apply(hh["miRand", , ], 2, max)) + } else { + randTreeMean + } + ret <- (sum(charInfo) - zero) / (one - zero) + if (is.numeric(normalize)) { + mcseInfo <- ((one - sum(charInfo)) / (one - zero) ^ 2) * mcseTree + mcseInfo[mcseInfo < sqrt(.Machine$double.eps)] <- 0 + structure(ret, mcse = mcseInfo) + } else { + ret + } + } + } + ) +} + +#' Generate colour to depict the amount and quality of observations +#' @param amount Numeric vector of values between 0 and 1, denoting the relative +#' amount of information +#' @param quality Numeric vector of values between -1 and 1, denoting the +#' quality of observations, where 0 is neutral. +#' @return `QACol()` returns an RGB hex code for a colour, where lighter colours +#' correspond to entries with a higher `amount`; unsaturated colours denote +#' a neutral `quality`; and red/cyan colours denote low/high `quality`. 
+#' @examples +#' amount <- runif(80, 0, 1) +#' quality <- runif(80, -1, 1) +#' plot(amount, quality, col = QACol(amount, quality), pch = 15) +#' abline(h = 0) #' @template MRS +#' @importFrom colorspace hex polarLUV +#' @export +QACol <- function(amount, quality) { + h <- 80 + (quality * 140) + l <- amount * 88 # < 100: white can take no hue + c <- abs(quality) * .MaxChroma(h, l) + # Saturation higher than 1 risks overflowing the colour space + # Small overflows are caught via `fixup = TRUE`; large overflows will produce + # bright red errors + saturation <- 0.999 # Safe if max_chroma(floor = FALSE) slightly overestimates + saturation <- 1.16 + + hex(polarLUV( + H = as.numeric(h), + C = as.numeric(c) * saturation, + L = as.numeric(l) + ), fixup = TRUE) +} + +#' @importFrom colorspace max_chroma +.MaxChroma <- function(h, l) { + ret <- `length<-`(double(0), length(h)) + applicable <- !is.na(h) & !is.na(l) + ret[applicable] <- max_chroma(h[applicable], l[applicable]) + ret +} + +#' @rdname QACol +#' @return `QCol()` returns an RGB hex code for a colour, where darker, +#' unsaturated colours denote a neutral `quality`; +#' and red/cyan colours denote low/high `quality`. `amount` is ignored. +#' @export +QCol <- function(amount, quality) { + h <- 80 + (quality * 140) + l <- abs(quality) * 88 # < 100: white can take no hue + c <- abs(quality) * .MaxChroma(h, l) + # Saturation higher than 1 risks overflowing the colour space + # Small overflows are caught via `fixup = TRUE`; large overflows will produce + # bright red errors + saturation <- 0.999 # Safe if max_chroma(floor = FALSE) slightly overestimates + + hex(polarLUV( + H = as.numeric(h), + C = as.numeric(c) * saturation, + L = as.numeric(l) + ), fixup = TRUE) +} + +#' @rdname QACol +#' @param where Location of legend, passed to `par(fig = where)` +#' @param n Integer vector giving number of cells to plot in swatch for +#' `quality` and `amount`. 
+#' @inheritParams ConcordanceTable +#' @export +QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) { + oPar <- par(fig = where, new = TRUE, mar = rep(0, 4), xpd = NA) + on.exit(par(oPar)) + n <- rep(n, length.out = 2) + nA <- n[[2]] + nQ <- n[[1]] + amount <- seq(0, 1, length.out = nA) + quality <- seq(-1, 1, length.out = nQ) + mat <- outer(amount, quality, + Vectorize(function (a, q) Col(a, q))) + image(x = amount, y = quality, z = matrix(1:prod(n), nA, nQ), + col = mat, axes = FALSE, xlab = "", ylab = "") + mtext("Amount \U2192", side = 1, line = 1) + mtext("Quality \U2192", side = 2, line = 1) +} + +#' Plot concordance table +#' +#' `ConcordanceTable()` plots a concordance table +#' \insertCite{SmithConc}{TreeSearch}. +#' +#' @inheritParams ClusteringConcordance +#' @param Col Function that takes vectors `amount` and `quality` and returns +#' a vector of colours. [QCol] colours by data quality (concordance); +#' [QACol] by quality and amount of information. +#' @param largeClade Integer; if greater than 1, vertical lines will be drawn +#' at edges whose descendants both contain more than `largeClade` leaves. +#' @param xlab Character giving a label for the x axis. +#' @param ylab Character giving a label for the y axis. +#' @param \dots Arguments to `abline`, to control the appearance of vertical +#' lines marking important edges. +#' @returns `ConcordanceTable()` invisibly returns a named list containing: +#' - `"info"`: The amount of information in each character-edge pair, in bits; +#' - `"amount"`: The information, normalized to the most information-rich pair; +#' - `"quality"`: The normalized mutual information of the pair; +#' - `"col"`: The colours used to plot the table. 
+#' +#' @references \insertAllCited{} +#' @examples +#' # Load data and tree +#' data("congreveLamsdellMatrices", package = "TreeSearch") +#' dataset <- congreveLamsdellMatrices[[1]][, 1:20] +#' tree <- referenceTree +#' +#' # Plot tree and identify nodes +#' library("TreeTools", quietly = TRUE) +#' plot(tree) +#' nodeIndex <- as.integer(rownames(as.Splits(tree))) +#' nodelabels(seq_along(nodeIndex), nodeIndex, adj = c(2, 1), +#' frame = "none", bg = NULL) +#' QALegend(where = c(0.1, 0.4, 0.1, 0.3)) +#' +#' # View information shared by characters and edges +#' ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) +#' axis(1) +#' axis(2) +#' +#' # Visualize dataset +#' image(t(`mode<-`(PhyDatToMatrix(dataset), "numeric")), axes = FALSE, +#' xlab = "Leaf", ylab = "Character") +#' @importFrom graphics abline image mtext +#' @importFrom TreeTools CladeSizes NTip +#' @export +ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, + xlab = "Edge", ylab = "Character", ...) { + cc <- ClusteringConcordance(tree, dataset, return = "all") + nodes <- seq_len(dim(cc)[[2]]) + info <- cc["hBest", , ] * cc["n", , ] + amount <- info / max(info, na.rm = TRUE) + amount[is.na(amount)] <- 0 + quality <- cc["normalized", , ] + # Plot points with incalculable quality as black, not transparent. + amount[is.na(quality)] <- 0 + quality[is.na(quality)] <- 0 + + col <- matrix(Col(amount, quality), dim(amount)[[1]], dim(amount)[[2]]) + image(nodes, seq_len(dim(cc)[[3]]), + matrix(1:prod(dim(amount)), dim(amount)[[1]]), + frame.plot = FALSE, axes = FALSE, + col = col, xlab = xlab, ylab = ylab) + + if (largeClade > 1) { + cladeSize <- CladeSizes(tree) + edge <- tree[["edge"]] + parent <- edge[, 1] + child <- edge[, 2] + bigNode <- vapply(as.integer(colnames(cc)), function (node) { + all(cladeSize[child[parent == parent[child == node]]] >= largeClade) + }, logical(1)) + abline(v = nodes[bigNode] - 0.5, ...) 
+ } + invisible(list(info = info, amount = amount, quality = quality, col = col)) +} + +#' @rdname SiteConcordance +#' @details +#' `MutualClusteringConcordance()` provides a character‑wise summary that +#' emphasises each character’s best‑matching split(s). It treats each character +#' as a simple tree and computes the mutual clustering information between this +#' character‑tree and the supplied phylogeny. High values identify characters +#' whose signal is well represented anywhere in the tree, even if concentrated +#' on a single edge. +#' +#' @return `MutualClusteringConcordance()` returns the mutual clustering +#' concordance of each character in `dataset` with `tree`. +#' The attribute `weighted.mean` gives the mean value, weighted by the +#' information content of each character. +#' @importFrom TreeTools MatchStrings +#' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo +#' @export +MutualClusteringConcordance <- function(tree, dataset) { + if (is.null(dataset)) { + warning("Cannot calculate concordance without `dataset`.") + return(NULL) + } + + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] + splits <- as.multiPhylo(as.Splits(tree)) + characters <- as.multiPhylo(dataset) + + support <- rowSums(vapply(characters, function (char) { + trimmed <- KeepTip(splits, TipLabels(char)) + cbind(mi = MutualClusteringInfo(char, trimmed), + possible = ClusteringEntropy(trimmed)) + }, matrix(NA_real_, length(splits), 2)), dims = 2) + + ret <- support[, 1] / support[, 2] + # Return: + structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) +} + +#' @rdname SiteConcordance +#' @details +#' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) +#' that are decisive for a split which are also concordant with it +#' (the site concordance factor \insertCite{Minh2020}{TreeSearch}). 
+#' For example, a quartet with the characters `0 0 0 1` is not decisive, as +#' all relationships between those leaves are equally parsimonious. +#' But a quartet with characters `0 0 1 1` is decisive, and is concordant +#' with any tree that groups the first two leaves together to the exclusion +#' of the second. +#' +#' By default, the reported value weights each site by the number of quartets +#' it is decisive for. This value can be interpreted as the proportion of +#' all decisive quartets that are concordant with a split. +#' If `weight = FALSE`, the reported value is the mean of the concordance +#' value for each site. +#' Consider a split associated with two sites: +#' one that is concordant with 25% of 96 decisive quartets, and +#' a second that is concordant with 75% of 4 decisive quartets. +#' If `weight = TRUE`, the split concordance will be (24 + 3) / (96 + 4) = 27%. +#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. +#' +#' `QuartetConcordance()` is computed exactly, using all quartets, whereas +#' other implementations (e.g. IQ-TREE) follow +#' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample +#' of quartets for a faster, if potentially less accurate, computation. +#' Ambiguous and inapplicable tokens are treated as containing no grouping +#' information (i.e. `(02)` or `-` are each treated as `?`). +#' @return +#' `QuartetConcordance(return = "edge")` returns a numeric vector giving the +#' concordance index at each split across all sites; names specify the number of +#' each corresponding split in `tree`. +#' +#' `QuartetConcordance(return = "char")` returns a numeric vector giving the +#' concordance index calculated at each site, averaged across all splits. +#' +#' @param weight Logical specifying whether to weight sites according to the +#' number of quartets they are decisive for. 
#' @importFrom ape keep.tip #' @importFrom cli cli_progress_bar cli_progress_update #' @importFrom utils combn #' @importFrom TreeTools as.Splits PhyDatToMatrix TipLabels -#' @name SiteConcordance -#' @family split support functions #' @export -QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { +QuartetConcordance <- function( + tree, + dataset = NULL, + weight = TRUE, + return = "edge" +) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) @@ -109,120 +608,131 @@ QuartetConcordance <- function (tree, dataset = NULL, weight = TRUE) { logical(NTip(dataset))) characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) + charLevels <- attr(dataset, "allLevels") + isAmbig <- rowSums(attr(dataset, "contrast")) > 1 + isInapp <- charLevels == "-" + nonGroupingLevels <- charLevels[isAmbig | isInapp] + characters[characters %in% nonGroupingLevels] <- NA + + charInt <- `mode<-`(characters, "integer") + raw_counts <- quartet_concordance(logiSplits, charInt) + + num <- raw_counts$concordant + den <- raw_counts$decisive + options <- c("character", "site", "default") + return <- options[[pmatch(tolower(trimws(return)), options, + nomatch = length(options))]] - cli_progress_bar(name = "Quartet concordance", total = dim(logiSplits)[[2]]) - setNames(apply(logiSplits, 2, function (split) { - cli_progress_update(1, .envir = parent.frame(2)) - quarts <- apply(characters, 2, function (char) { - tab <- table(split, char) - nCol <- dim(tab)[[2]] - if (nCol > 1L) { - # Consider the case - # split 0 1 2 - # FALSE 2 3 0 - # TRUE 0 4 2 - # - # Concordant quartets with bin i = 1 (character 0) and bin j = 2 - # (character 1) include any combination of the two taxa from - # [FALSE, 0], and the two taxa chosen from the four in [TRUE, 1], - # i.e. 
choose(2, 2) * choose(4, 2) - concordant <- sum(vapply(seq_len(nCol), function (i) { - inBinI <- tab[1, i] - iChoices <- choose(inBinI, 2) - sum(vapply(seq_len(nCol)[-i], function (j) { - inBinJ <- tab[2, j] - iChoices * choose(inBinJ, 2) - }, 1)) - }, 1)) - - # To be discordant, we must select a pair of taxa from TT and from FF; - # and the character states must group each T with an F - # e.g. T0 T1 F0 F1 - # T0 T1 F0 F2 would not be discordant - just uninformative - discordant <- sum(apply(combn(nCol, 2), 2, function (ij) prod(tab[, ij]))) - - # Only quartets that include two T and two F can be decisive - # Quartets must also include two pairs of characters - decisive <- concordant + discordant - - # Return the numerator and denominatory of equation 2 in - # Minh et al. 2020 - c(concordant, decisive) - } else { - c(0L, 0L) - } - }) + + if (return == "default") { if (isTRUE(weight)) { - quartSums <- rowSums(quarts) - ifelse(is.nan(quartSums[[2]]), NA_real_, quartSums[[1]] / quartSums[[2]]) + # Sum numerator and denominator across sites (columns), then divide + # This matches weighted.mean(num/den, den) == sum(num) / sum(den) + split_sums_num <- rowSums(num) + split_sums_den <- rowSums(den) + ret <- ifelse( + split_sums_den == 0, + NA_real_, + split_sums_num / split_sums_den + ) } else { - mean(ifelse(is.nan(quarts[2, ]), NA_real_, quarts[1, ] / quarts[2, ]), - na.rm = TRUE) + # Mean of ratios per site + # Avoid division by zero (0/0 -> NaN -> NA handled by na.rm) + ratios <- num / den + # Replace NaN/Inf with NA for rowMeans calculation + ratios[!is.finite(ratios)] <- NA + ret <- rowMeans(ratios, na.rm = TRUE) } - }), names(splits)) -} -#' @importFrom TreeDist Entropy -.Entropy <- function (...) { - Entropy(c(...) 
/ sum(...)) + setNames(ret, names(splits)) + } else { + # return = "char" + p <- num / den + if (isTRUE(weight)) { + vapply( + seq_len(dim(num)[[2]]), + function(i) { + weighted.mean(num[, i] / den[, i], den[, i]) + }, + double(1) + ) + } else { + vapply( + seq_len(dim(num)[[2]]), + function(i) { + mean(num[den[, i] > 0, i] / den[den[, i] > 0, i]) + }, + double(1) + ) + } + } } -#' @rdname SiteConcordance -#' @importFrom TreeTools Subsplit -#' @importFrom stats setNames -#' @export -ClusteringConcordance <- function (tree, dataset) { - if (is.null(dataset)) { - warning("Cannot calculate concordance without `dataset`.") - return(NULL) - } - dataset <- dataset[TipLabels(tree)] - splits <- as.logical(as.Splits(tree)) - - at <- attributes(dataset) - cont <- at[["contrast"]] - if ("-" %in% colnames(cont)) { - cont[cont[, "-"] > 0, ] <- 1 +#' @importFrom fastmap fastmap +.ExpectedMICache <- fastmap() + +# @param a must be a vector of length <= 2 +# @param b may be longer +#' @importFrom base64enc base64encode +.ExpectedMI <- function(a, b) { + if (length(a) < 2 || length(b) < 2) { + 0 + } else { + key <- base64enc::base64encode(mi_key(a, b)) + if (.ExpectedMICache$has(key)) { + .ExpectedMICache$get(key) + } else { + ret <- expected_mi(a, b) + + # Cache: + .ExpectedMICache$set(key, ret) + # Return: + ret + } } - ambiguous <- rowSums(cont) != 1 - - mat <- matrix(unlist(dataset), length(dataset), byrow = TRUE) - mat[mat %in% which(ambiguous)] <- NA - mat <- apply(mat, 2, function (x) { - uniques <- table(x) == 1 - x[x %in% names(uniques[uniques])] <- NA - x - }) - - h <- apply(mat, 2, function (char) { - aChar <- !is.na(char) - ch <- char[aChar] - hChar <- .Entropy(table(ch)) - h <- apply(splits[, aChar], 1, function (spl) { - c(hSpl = .Entropy(table(spl)), hJoint = .Entropy(table(ch, spl))) - }) - - cbind(hSum = hChar + h["hSpl", ], joint = h["hJoint", ]) - }) - - splitI <- seq_len(dim(splits)[1]) - both <- rowSums(h[splitI, at[["index"]], drop = FALSE]) - joint <- 
rowSums(h[-splitI, at[["index"]], drop = FALSE]) - mi <- both - joint - - # Return: - setNames(mi / joint, rownames(splits)) +} + +#' Re-zero a value by normalization +#' @param value value ranging from zero to one +#' @param zero new value to set as zero +#' @keywords internal +.Rezero <- function(value, zero) { + (value - zero) / (1 - zero) } #' @rdname SiteConcordance +#' +#' @details +#' `PhylogeneticConcordance()` treats each character in `dataset` as a +#' phylogenetic hypothesis and measures the extent to which it supports the +#' splits of `tree`. Each character is first interpreted as a tree (or set of +#' trees) in which taxa sharing the same token form a clade. Only splits for +#' which the character contains at least four relevant taxa can contribute +#' information. +#' +#' For each split, the function identifies which characters could potentially +#' support that split (i.e. those for which the induced subtrees contain +#' informative structure), and among these, which characters are actually +#' compatible with the split. The concordance value for each split is the +#' proportion of informative characters that support it. +#' A value of 1 indicates that all characters informative for that subset of +#' taxa support the split; a value of 0 indicates that none do. Characters that +#' contain only ambiguous or uninformative states for the relevant taxa do not +#' affect the result. +#' +#' @return `PhylogeneticConcordance()` returns a numeric vector giving the +#' phylogenetic information of each split in `tree`, named according to the +#' split's internal numbering. 
+#' #' @importFrom TreeTools as.multiPhylo CladisticInfo CompatibleSplits +#' MatchStrings #' @export -PhylogeneticConcordance <- function (tree, dataset) { +PhylogeneticConcordance <- function(tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.Splits(tree) if (is.null(names(splits))) { names(splits) <- paste0("sp", seq_along(splits)) @@ -253,38 +763,26 @@ PhylogeneticConcordance <- function (tree, dataset) { } #' @rdname SiteConcordance -# Mutual clustering information of each split with the split implied by each character -#' @importFrom TreeDist ClusteringEntropy MutualClusteringInfo -#' @export -MutualClusteringConcordance <- function (tree, dataset) { - if (is.null(dataset)) { - warning("Cannot calculate concordance without `dataset`.") - return(NULL) - } - dataset <- dataset[TipLabels(tree)] - splits <- as.multiPhylo(as.Splits(tree)) - characters <- as.multiPhylo(dataset) - - support <- rowSums(vapply(characters, function (char) { - trimmed <- lapply(splits, keep.tip, TipLabels(char)) - cbind(mi = MutualClusteringInfo(char, trimmed), - possible = ClusteringEntropy(trimmed)) - }, matrix(NA_real_, length(splits), 2)), dims = 2) - - # Return: - support[, 1] / support[, 2] -} - -#' @rdname SiteConcordance -#' @importFrom TreeTools as.multiPhylo +#' @details +#' `SharedPhylogeneticConcordance()` treats each character as a simple tree. +#' Each token in the character corresponds to a node whose pendant edges are the +#' taxa with that token. +#' The Shared Phylogenetic Concordance for each character in `dataset` is then +#' the Shared Phylogenetic Information \insertCite{Smith2020}{TreeSearch} of +#' this tree and `tree`. +#' @return `SharedPhylogeneticConcordance()` returns the shared phylogenetic +#' concordance of each character in `dataset` with `tree`. 
+#' The attribute `weighted.mean` gives the mean value, weighted by the +#' information content of each character. +#' @importFrom TreeTools as.multiPhylo MatchStrings #' @importFrom TreeDist ClusteringInfo SharedPhylogeneticInfo #' @export -SharedPhylogeneticConcordance <- function (tree, dataset) { +SharedPhylogeneticConcordance <- function(tree, dataset) { if (is.null(dataset)) { warning("Cannot calculate concordance without `dataset`.") return(NULL) } - dataset <- dataset[TipLabels(tree)] + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] splits <- as.multiPhylo(as.Splits(tree)) characters <- as.multiPhylo(dataset) @@ -294,23 +792,24 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { possible = ClusteringInfo(trimmed)) }, matrix(NA_real_, length(splits), 2)), dims = 2) + ret <- support[, 1] / support[, 2] # Return: - support[, 1] / support[, 2] + structure(ret, weighted.mean = weighted.mean(ret, support[, 2])) } #' Evaluate the concordance of information between a tree and a dataset -#' +#' #' Details the amount of information in a phylogenetic dataset that is #' consistent with a specified phylogenetic tree, and the signal:noise #' ratio of the character matrix implied if the tree is true. -#' +#' #' Presently restricted to datasets whose characters contain a maximum of #' two parsimony-informative states. 
-#' +#' #' @return `ConcordantInformation()` returns a named vector with elements: -#' +#' #' - `informationContent`: cladistic information content of `dataset` -#' - `signal`, `noise`: amount of cladistic information that represents +#' - `signal`, `noise`: amount of cladistic information that represents #' phylogenetic signal and noise, according to `tree` #' - `signalToNoise`: the implied signal:noise ratio of `dataset` #' - `treeInformation`: the cladistic information content of a bifurcating tree @@ -322,7 +821,7 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { #' - `ignored`: information content of characters whose signal and noise could #' not be calculated (too many states) and so are not included in the totals #' above. -#' +#' #' @inheritParams TreeTools::Renumber #' @inheritParams MaximizeParsimony #' @examples @@ -330,18 +829,18 @@ SharedPhylogeneticConcordance <- function (tree, dataset) { #' myMatrix <- congreveLamsdellMatrices[[10]] #' ConcordantInformation(TreeTools::NJTree(myMatrix), myMatrix) #' @template MRS -#' @importFrom TreeTools Log2UnrootedMult Log2Unrooted +#' @importFrom TreeTools Log2UnrootedMult Log2Unrooted MatchStrings #' @export -ConcordantInformation <- function (tree, dataset) { - dataset <- dataset[TipLabels(tree)] +ConcordantInformation <- function(tree, dataset) { + dataset <- dataset[MatchStrings(TipLabels(tree), names(dataset))] originalInfo <- sum(apply(PhyDatToMatrix(dataset), 2, CharacterInformation)) dataset <- PrepareDataProfile(dataset) - + extraSteps <- CharacterLength(tree, dataset, compress = TRUE) - MinimumLength(dataset, compress = TRUE) chars <- matrix(unlist(dataset), attr(dataset, "nr")) ambiguousToken <- which(attr(dataset, "allLevels") == "?") - asSplits <- apply(chars, 1, function (x) { + asSplits <- apply(chars, 1, function(x) { ret <- table(x) if (length(ambiguousToken) != 0) { ret[names(ret) != ambiguousToken] @@ -368,8 +867,7 @@ ConcordantInformation <- function (tree, dataset) { }, 
double(1)) noise <- ic - signal noise[noise < sqrt(.Machine[["double.eps"]])] <- 0 - - + index <- attr(dataset, "index") if (any(is.na(signal))) { na <- is.na(signal) @@ -384,7 +882,7 @@ ConcordantInformation <- function (tree, dataset) { totalNoise <- sum(noise[index], na.rm = TRUE) totalSignal <- sum(signal[index], na.rm = TRUE) signalNoise <- totalSignal / totalNoise - + infoNeeded <- Log2Unrooted(length(dataset)) infoOverkill <- totalInfo / infoNeeded @@ -402,7 +900,7 @@ ConcordantInformation <- function (tree, dataset) { totalSignal <- sum(signal[index]) signalNoise <- totalSignal / totalNoise discarded = 0 - + infoNeeded <- Log2Unrooted(length(dataset)) infoOverkill <- totalInfo / infoNeeded discarded <- originalInfo - totalInfo @@ -419,22 +917,23 @@ ConcordantInformation <- function (tree, dataset) { signif(infoNeeded), " needed. ", "S:N = ", signif(signalNoise), "\n") } - + # Return: - c(informationContent = totalInfo, + c( + informationContent = totalInfo, signal = totalSignal, noise = totalNoise, - signalToNoise = signalNoise, - + signalToNoise = signalNoise, + treeInformation = infoNeeded, matrixToTree = infoOverkill, ignored = discarded - ) + ) } #' @rdname ConcordantInformation #' @export -Evaluate <- function (tree, dataset) { +Evaluate <- function(tree, dataset) { .Deprecated("ConcordantInformation()") ConcordantInformation(tree, dataset) } diff --git a/R/Consistency.R b/R/Consistency.R index bd695f758..2aeb4adbc 100644 --- a/R/Consistency.R +++ b/R/Consistency.R @@ -1,4 +1,4 @@ -#' Consistency / retention "indices" +#' Consistency and retention "indices" #' #' `Consistency()` calculates the consistency "index" and retention index #' \insertCite{Farris1989}{TreeSearch} @@ -19,7 +19,7 @@ #' Note that as the possible values of the consistency index do not range from #' zero to one, it is not an index in the mathematical sense of the term. 
#' Shortcomings of this measure are widely documented -#' \insertCite{Archie1989,Brooks1986,Steell2023}{TreeSearch}. +#' \insertCite{Archie1989,Brooks1986,Steell2025}{TreeSearch}. #' #' The maximum length of a character (see [`MaximumLength()`]) is the #' number of steps in a parsimonious reconstruction on the longest possible tree @@ -35,7 +35,7 @@ #' possible values runs from zero (least consistent) to one #' (perfectly consistent). #' -#' The **relative homoplasy index** \insertCite{Steell2023}{TreeSearch} is +#' The **relative homoplasy index** \insertCite{Steell2025}{TreeSearch} is #' the ratio of the observed excess tree length to the excess tree length #' due to chance, taken as the median score of a character when the leaves #' of the given tree are randomly shuffled. @@ -45,8 +45,8 @@ #' default treatment in [`TreeLength()`]. #' #' @param nRelabel Integer specifying how many times to relabel leaves when -#' estimating null tree length for \acronym{RHI} calculation. -#' \insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +#' computing MCMC estimate of null tree length for \acronym{RHI} calculation. +#' \insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that #' 100 may suffice. #' If zero (the default), the \acronym{RHI} is not calculated. 
#' @inheritParams CharacterLength diff --git a/R/RcppExports.R b/R/RcppExports.R index 33cb4a90f..a2bc7abe5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,14 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +expected_mi <- function(ni, nj) { + .Call(`_TreeSearch_expected_mi`, ni, nj) +} + +mi_key <- function(ni, nj) { + .Call(`_TreeSearch_mi_key`, ni, nj) +} + preorder_morphy <- function(edge, MorphyHandl) { .Call(`_TreeSearch_preorder_morphy`, edge, MorphyHandl) } @@ -17,6 +25,10 @@ morphy_profile <- function(edge, MorphyHandls, weight, sequence, profiles, targe .Call(`_TreeSearch_morphy_profile`, edge, MorphyHandls, weight, sequence, profiles, target) } +quartet_concordance <- function(splits, characters) { + .Call(`_TreeSearch_quartet_concordance`, splits, characters) +} + nni <- function(edge, randomEdge, whichSwitch) { .Call(`_TreeSearch_nni`, edge, randomEdge, whichSwitch) } diff --git a/benchmark/bench-iw.R b/benchmark/bench-iw.R index 0ebc566c8..aad034a42 100644 --- a/benchmark/bench-iw.R +++ b/benchmark/bench-iw.R @@ -1,4 +1,10 @@ library("TreeTools") + +tmp_lib <- tempfile(pattern = "lib") +dir.create(tmp_lib) +devtools::install(args = paste0("--library=", tmp_lib)) +library("TreeSearch", lib.loc = tmp_lib) + data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[42]] tree <- AdditionTree(dataset) |> RenumberTips(names(dataset)) |> RootTree(1) diff --git a/benchmark/bench-quartet-conc.R b/benchmark/bench-quartet-conc.R new file mode 100644 index 000000000..46a959ffd --- /dev/null +++ b/benchmark/bench-quartet-conc.R @@ -0,0 +1,18 @@ +library("TreeTools") + +tmp_lib <- tempfile(pattern = "lib") +dir.create(tmp_lib) +devtools::install(args = paste0("--library=", tmp_lib)) +library("TreeSearch", lib.loc = tmp_lib) + +data("congreveLamsdellMatrices", package = "TreeSearch") +dataset <- congreveLamsdellMatrices[[42]] +someNA <- 
PhyDatToMatrix(dataset) +someNA[sample.int(length(someNA), 444)] <- "?" +someNA <- MatrixToPhyDat(someNA) +tree <- TreeSearch::referenceTree + +bench::mark( + QuartetConcordance(tree, dataset), + QuartetConcordance(tree, someNA), check = FALSE +) diff --git a/codemeta.json b/codemeta.json index c5d4941cc..77696cabf 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,13 +8,13 @@ "codeRepository": "https://github.com/ms609/TreeSearch/", "issueTracker": "https://github.com/ms609/TreeSearch/issues/", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "1.7.0", + "version": "1.8.0", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", "url": "https://r-project.org" }, - "runtimePlatform": "R version 4.5.1 (2025-06-13)", + "runtimePlatform": "R version 4.5.2 (2025-10-31 ucrt)", "provider": { "@id": "https://cran.r-project.org", "@type": "Organization", @@ -174,6 +174,18 @@ "version": ">= 4.0" }, "2": { + "@type": "SoftwareApplication", + "identifier": "abind", + "name": "abind", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=abind" + }, + "3": { "@type": "SoftwareApplication", "identifier": "ape", "name": "ape", @@ -186,7 +198,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=ape" }, - "3": { + "4": { + "@type": "SoftwareApplication", + "identifier": "base64enc", + "name": "base64enc", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=base64enc" + }, + "5": { "@type": "SoftwareApplication", "identifier": "cli", "name": "cli", @@ -199,7 +223,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=cli" }, - "4": { + "6": { "@type": "SoftwareApplication", "identifier": "cluster", "name": "cluster", @@ 
-211,7 +235,19 @@ }, "sameAs": "https://CRAN.R-project.org/package=cluster" }, - "5": { + "7": { + "@type": "SoftwareApplication", + "identifier": "colorspace", + "name": "colorspace", + "provider": { + "@id": "https://cran.r-project.org", + "@type": "Organization", + "name": "Comprehensive R Archive Network (CRAN)", + "url": "https://cran.r-project.org" + }, + "sameAs": "https://CRAN.R-project.org/package=colorspace" + }, + "8": { "@type": "SoftwareApplication", "identifier": "fastmap", "name": "fastmap", @@ -223,7 +259,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fastmap" }, - "6": { + "9": { "@type": "SoftwareApplication", "identifier": "fastmatch", "name": "fastmatch", @@ -236,7 +272,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fastmatch" }, - "7": { + "10": { "@type": "SoftwareApplication", "identifier": "fs", "name": "fs", @@ -248,7 +284,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=fs" }, - "8": { + "11": { "@type": "SoftwareApplication", "identifier": "future", "name": "future", @@ -260,7 +296,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=future" }, - "9": { + "12": { "@type": "SoftwareApplication", "identifier": "PlotTools", "name": "PlotTools", @@ -272,7 +308,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=PlotTools" }, - "10": { + "13": { "@type": "SoftwareApplication", "identifier": "promises", "name": "promises", @@ -284,7 +320,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=promises" }, - "11": { + "14": { "@type": "SoftwareApplication", "identifier": "protoclust", "name": "protoclust", @@ -296,7 +332,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=protoclust" }, - "12": { + "15": { "@type": "SoftwareApplication", "identifier": "Rcpp", "name": "Rcpp", @@ -308,7 +344,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rcpp" }, - "13": { + "16": { "@type": "SoftwareApplication", "identifier": "Rdpack", "name": "Rdpack", @@ -321,7 +357,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rdpack" }, - 
"14": { + "17": { "@type": "SoftwareApplication", "identifier": "Rogue", "name": "Rogue", @@ -334,7 +370,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=Rogue" }, - "15": { + "18": { "@type": "SoftwareApplication", "identifier": "shiny", "name": "shiny", @@ -347,7 +383,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=shiny" }, - "16": { + "19": { "@type": "SoftwareApplication", "identifier": "shinyjs", "name": "shinyjs", @@ -359,12 +395,12 @@ }, "sameAs": "https://CRAN.R-project.org/package=shinyjs" }, - "17": { + "20": { "@type": "SoftwareApplication", "identifier": "stats", "name": "stats" }, - "18": { + "21": { "@type": "SoftwareApplication", "identifier": "stringi", "name": "stringi", @@ -376,7 +412,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=stringi" }, - "19": { + "22": { "@type": "SoftwareApplication", "identifier": "TreeDist", "name": "TreeDist", @@ -389,7 +425,7 @@ }, "sameAs": "https://CRAN.R-project.org/package=TreeDist" }, - "20": { + "23": { "@type": "SoftwareApplication", "identifier": "TreeTools", "name": "TreeTools", @@ -404,7 +440,7 @@ }, "SystemRequirements": "C++17" }, - "fileSize": "4584.9KB", + "fileSize": "4632.484KB", "citation": [ { "@type": "SoftwareSourceCode", @@ -476,7 +512,7 @@ ], "name": "{TreeSearch}: phylogenetic analysis with discrete character data", "identifier": "10.5281/zenodo.1042590", - "description": "R package version 1.7.0", + "description": "R package version 1.8.0", "@id": "https://doi.org/10.5281/zenodo.1042590", "sameAs": "https://doi.org/10.5281/zenodo.1042590" }, diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index d82e34841..b26a7a984 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -509,6 +509,13 @@ @article{Smith2020 doi = {10.1093/bioinformatics/btaa614} } +@article{SmithConc, + title = {Which characters support which clades? 
Exploring the distribution of phylogenetic signal using concordant information}, + author = {Smith, Martin R.}, + journal = {Forthcoming}, + year = {2026} +} + @article{SmithCons, title = {Using information theory to detect rogue taxa and improve consensus trees}, author = {Smith, Martin R.}, @@ -576,7 +583,7 @@ @article{Steel1996 number = {4} } -@article{Steell2023, +@article{Steell2025, title = {Revealing Patterns of Homoplasy in Discrete Phylogenetic Datasets with a Cross-Comparable Index}, author = {Steell, Elizabeth M. and Hsiang, Allison Y. and Field, Daniel J.}, year = {2025}, diff --git a/man/ConcordanceTable.Rd b/man/ConcordanceTable.Rd new file mode 100644 index 000000000..5239f1bcb --- /dev/null +++ b/man/ConcordanceTable.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{ConcordanceTable} +\alias{ConcordanceTable} +\title{Plot concordance table} +\usage{ +ConcordanceTable( + tree, + dataset, + Col = QACol, + largeClade = 0, + xlab = "Edge", + ylab = "Character", + ... +) +} +\arguments{ +\item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} + +\item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class +\code{phyDat}, whose names correspond to the labels of any accompanying tree. +Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. +Additive (ordered) characters can be handled using +\code{\link[TreeTools]{Decompose}()}.} + +\item{Col}{Function that takes vectors \code{amount} and \code{quality} and returns +a vector of colours. 
\link{QCol} colours by data quality (concordance); +\link{QACol} by quality and amount of information.} + +\item{largeClade}{Integer; if greater than 1, vertical lines will be drawn +at edges whose descendants both contain more than \code{largeClade} leaves.} + +\item{xlab}{Character giving a label for the x axis.} + +\item{ylab}{Character giving a label for the y axis.} + +\item{\dots}{Arguments to \code{abline}, to control the appearance of vertical +lines marking important edges.} +} +\value{ +\code{ConcordanceTable()} invisibly returns a named list containing: +\itemize{ +\item \code{"info"}: The amount of information in each character-edge pair, in bits; +\item \code{"relInfo"}: The information, normalized to the most information-rich pair; +\item \code{"quality"}: The normalized mutual information of the pair; +\item \code{"col"}: The colours used to plot the table. +} +} +\description{ +\code{ConcordanceTable()} plots a concordance table +\insertCite{SmithConc}{TreeSearch}. +} +\examples{ +# Load data and tree +data("congreveLamsdellMatrices", package = "TreeSearch") +dataset <- congreveLamsdellMatrices[[1]][, 1:20] +tree <- referenceTree + +# Plot tree and identify nodes +library("TreeTools", quietly = TRUE) +plot(tree) +nodeIndex <- as.integer(rownames(as.Splits(tree))) +nodelabels(seq_along(nodeIndex), nodeIndex, adj = c(2, 1), + frame = "none", bg = NULL) +QALegend(where = c(0.1, 0.4, 0.1, 0.3)) + +# View information shared by characters and edges +ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) +axis(1) +axis(2) + +# Visualize dataset +image(t(`mode<-`(PhyDatToMatrix(dataset), "numeric")), axes = FALSE, + xlab = "Leaf", ylab = "Character") +} +\references{ +\insertAllCited{} +} diff --git a/man/Consistency.Rd b/man/Consistency.Rd index f0fb87a77..73ff05282 100644 --- a/man/Consistency.Rd +++ b/man/Consistency.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/Concordance.R \name{Consistency} \alias{Consistency}
-\title{Consistency / retention "indices"} +\title{Consistency and retention "indices"} \usage{ Consistency(dataset, tree, nRelabel = 0, compress = FALSE) } @@ -16,8 +16,8 @@ Additive (ordered) characters can be handled using \item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} \item{nRelabel}{Integer specifying how many times to relabel leaves when -estimating null tree length for \acronym{RHI} calculation. -\insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +computing MCMC estimate of null tree length for \acronym{RHI} calculation. +\insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that 100 may suffice. If zero (the default), the \acronym{RHI} is not calculated.} @@ -53,7 +53,7 @@ the tree is optimal. Note that as the possible values of the consistency index do not range from zero to one, it is not an index in the mathematical sense of the term. Shortcomings of this measure are widely documented -\insertCite{Archie1989,Brooks1986,Steell2023}{TreeSearch}. +\insertCite{Archie1989,Brooks1986,Steell2025}{TreeSearch}. The maximum length of a character (see \code{\link[=MaximumLength]{MaximumLength()}}) is the number of steps in a parsimonious reconstruction on the longest possible tree @@ -69,7 +69,7 @@ retention indices; it rescales the consistency index such that its range of possible values runs from zero (least consistent) to one (perfectly consistent). -The \strong{relative homoplasy index} \insertCite{Steell2023}{TreeSearch} is +The \strong{relative homoplasy index} \insertCite{Steell2025}{TreeSearch} is the ratio of the observed excess tree length to the excess tree length due to chance, taken as the median score of a character when the leaves of the given tree are randomly shuffled. 
diff --git a/man/ExpectedLength.Rd b/man/ExpectedLength.Rd index 1b6fc4a6b..5c7cc3659 100644 --- a/man/ExpectedLength.Rd +++ b/man/ExpectedLength.Rd @@ -16,8 +16,8 @@ Additive (ordered) characters can be handled using \item{tree}{A tree of class \code{\link[ape:read.tree]{phylo}}.} \item{nRelabel}{Integer specifying how many times to relabel leaves when -estimating null tree length for \acronym{RHI} calculation. -\insertCite{Steell2023;textual}{TreeSearch} recommend 1000, but suggest that +computing MCMC estimate of null tree length for \acronym{RHI} calculation. +\insertCite{Steell2025;textual}{TreeSearch} recommend 1000, but suggest that 100 may suffice. If zero (the default), the \acronym{RHI} is not calculated.} diff --git a/man/QACol.Rd b/man/QACol.Rd new file mode 100644 index 000000000..bb686a870 --- /dev/null +++ b/man/QACol.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{QACol} +\alias{QACol} +\alias{QCol} +\alias{QALegend} +\title{Generate colour to depict the amount and quality of observations} +\usage{ +QACol(amount, quality) + +QCol(amount, quality) + +QALegend(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol) +} +\arguments{ +\item{amount}{Numeric vector of values between 0 and 1, denoting the relative +amount of information} + +\item{quality}{Numeric vector of values between -1 and 1, denoting the +quality of observations, where 0 is neutral.} + +\item{where}{Location of legend, passed to \code{par(fig = where)}} + +\item{n}{Integer vector giving number of cells to plot in swatch for +\code{quality} and \code{amount}.} + +\item{Col}{Function that takes vectors \code{amount} and \code{quality} and returns +a vector of colours. 
\link{QCol} colours by data quality (concordance); +\link{QACol} by quality and amount of information.} +} +\value{ +\code{QACol()} returns an RGB hex code for a colour, where lighter colours +correspond to entries with a higher \code{amount}; unsaturated colours denote +a neutral \code{quality}; and red/cyan colours denote low/high \code{quality}. + +\code{QCol()} returns an RGB hex code for a colour, where darker, +unsaturated colours denote a neutral \code{quality}; +and red/cyan colours denote low/high \code{quality}. \code{amount} is ignored. +} +\description{ +Generate colour to depict the amount and quality of observations +} +\examples{ +amount <- runif(80, 0, 1) +quality <- runif(80, -1, 1) +plot(amount, quality, col = QACol(amount, quality), pch = 15) +abline(h = 0) +} +\author{ +\href{https://smithlabdurham.github.io/}{Martin R. Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} diff --git a/man/SiteConcordance.Rd b/man/SiteConcordance.Rd index 9a27ac8ad..dbffef045 100644 --- a/man/SiteConcordance.Rd +++ b/man/SiteConcordance.Rd @@ -2,21 +2,21 @@ % Please edit documentation in R/Concordance.R \name{SiteConcordance} \alias{SiteConcordance} -\alias{QuartetConcordance} \alias{ClusteringConcordance} -\alias{PhylogeneticConcordance} \alias{MutualClusteringConcordance} +\alias{QuartetConcordance} +\alias{PhylogeneticConcordance} \alias{SharedPhylogeneticConcordance} -\title{Calculate site concordance factor} +\title{Concordance factors} \usage{ -QuartetConcordance(tree, dataset = NULL, weight = TRUE) +ClusteringConcordance(tree, dataset, return = "edge", normalize = TRUE) + +MutualClusteringConcordance(tree, dataset) -ClusteringConcordance(tree, dataset) +QuartetConcordance(tree, dataset = NULL, weight = TRUE, return = "edge") PhylogeneticConcordance(tree, dataset) -MutualClusteringConcordance(tree, dataset) - SharedPhylogeneticConcordance(tree, dataset) } \arguments{ @@ -28,17 +28,129 @@ Perhaps load into R using 
\code{\link[TreeTools]{ReadAsPhyDat}()}. Additive (ordered) characters can be handled using \code{\link[TreeTools]{Decompose}()}.} +\item{return}{Character specifying the summary to return. Options are: +\itemize{ +\item \code{"edge"}: average concordance of each tree split across all characters; +\item \code{"char"}: concordance of each character averaged over all splits, weighted +by each character's information content; +\item \code{"tree"}: an overall tree‑level concordance score; +\item \code{"all"}: a full array of MI components and normalized values for every +split–character pair. +} + +Matching is case‑insensitive and partial.} + +\item{normalize}{Controls how the \emph{expected} mutual information (the zero +point of the scale) is determined. +\itemize{ +\item \code{FALSE}: no chance correction; MI is scaled only by its maximum. +\item \code{TRUE}: subtract the \strong{analytical} expected MI for random association. +\item \verb{<positive integer>}: subtract an \strong{empirical} expected MI estimated from that +number of random trees. +} + +In all cases, 1 corresponds to the maximal attainable MI for the pair +(\code{hBest}), and 0 corresponds to the chosen expectation.} + \item{weight}{Logical specifying whether to weight sites according to the number of quartets they are decisive for.} } +\value{ +\code{ClusteringConcordance(return = "all")} returns a 3D array where each +slice corresponds to a character (site), each column to a tree split, and +each row to a different information measure. The \code{normalized} row gives the +normalized mutual information between each split-character pair, scaled so +that 1.0 corresponds to \code{hBest} (the theoretical maximum mutual information, +being the minimum of \code{hSplit} and \code{hChar}) and 0.0 corresponds to \code{miRand} +(the expected mutual information under random association). 
\code{hSplit} gives +the entropy (information content) of each split's bipartition; \code{hChar} gives +the entropy of each character's state distribution; \code{hJoint} gives the joint +entropy of the split-character confusion matrix; \code{mi} gives the raw mutual +information; and \code{n} records the number of informative observations. +Negative normalized values indicate observed mutual information below random +expectation. \code{NA} is returned when \code{hBest = 0} (no information potential). + +\code{ClusteringConcordance(return = "edge")} returns a vector where each element +corresponds to a split (an edge of the tree) and gives the normalized mutual +information between that split and the character data, averaged across all +characters. +When \code{normalize = TRUE} (default), values are scaled relative to random +expectation; when \code{FALSE}, raw mutual information normalized by \code{hBest} is +returned. + +\code{ClusteringConcordance(return = "char")} returns a vector where each element +corresponds to a character (site) and gives the entropy-weighted average +normalized mutual information between that character and all tree splits. +Characters with higher information content receive proportionally more weight +from splits that can potentially convey more information about them. + +\code{ClusteringConcordance(return = "tree")} returns a single value representing +the overall concordance between the tree topology and the character data. +This averages the fit of the best-matching split for each character. +This is included for completeness, though it is not clear that this is a useful +or meaningful measure. + +\code{MutualClusteringConcordance()} returns the mutual clustering +concordance of each character in \code{dataset} with \code{tree}. +The attribute \code{weighted.mean} gives the mean value, weighted by the +information content of each character. 
+ +\code{QuartetConcordance(return = "edge")} returns a numeric vector giving the +concordance index at each split across all sites; names specify the number of +each corresponding split in \code{tree}. + +\code{QuartetConcordance(return = "char")} returns a numeric vector giving the +concordance index calculated at each site, averaged across all splits. + +\code{PhylogeneticConcordance()} returns a numeric vector giving the +phylogenetic information of each split in \code{tree}, named according to the +split's internal numbering. + +\code{SharedPhylogeneticConcordance()} returns the shared phylogenetic +concordance of each character in \code{dataset} with \code{tree}. +The attribute \code{weighted.mean} gives the mean value, weighted by the +information content of each character. +} \description{ -The site concordance factor \insertCite{Minh2020}{TreeSearch} is a measure -of the strength of support that the dataset presents for a given split in a -tree. +Concordance measures the strength of support that characters in a dataset +present for each split (=edge/branch) in a tree +\insertCite{Minh2020,SmithConc}{TreeSearch}. } \details{ +\code{ClusteringConcordance()} measures how well each tree split reflects the +grouping structure implied by each character +\insertCite{SmithConc}{TreeSearch}. +Characters and splits are treated as clusterings of taxa, and their agreement +is quantified using mutual information (MI). All reported values are scaled +so that 1 corresponds to the maximum possible mutual information for each +split–character pair (\code{hBest}). + +The \code{normalize} argument specifies how the zero point is defined. +\itemize{ +\item If \code{normalize = FALSE}, zero corresponds to \emph{zero} MI, without correcting +for the positive bias that arises because MI is rarely exactly zero in +finite samples. +\item If \code{normalize = TRUE}, the expected MI is computed using an analytical +approximation based on the distribution of character tokens. 
This is fast +and generally accurate for large trees (≈200+ taxa), but does not account +for correlation between splits. +\item If \code{normalize} is a positive integer \code{n}, the expected MI is estimated +empirically by fitting each character to \code{n} uniformly random trees and +averaging the resulting MI values. This Monte Carlo approach provides a +more accurate baseline for small trees, for which the analytical +approximation is biased. Monte Carlo standard errors are returned. +} + +\code{MutualClusteringConcordance()} provides a character‑wise summary that +emphasises each character’s best‑matching split(s). It treats each character +as a simple tree and computes the mutual clustering information between this +character‑tree and the supplied phylogeny. High values identify characters +whose signal is well represented anywhere in the tree, even if concentrated +on a single edge. + \code{QuartetConcordance()} is the proportion of quartets (sets of four leaves) -that are decisive for a split which are also concordant with it. +that are decisive for a split which are also concordant with it +(the site concordance factor \insertCite{Minh2020}{TreeSearch}). For example, a quartet with the characters \verb{0 0 0 1} is not decisive, as all relationships between those leaves are equally parsimonious. But a quartet with characters \verb{0 0 1 1} is decisive, and is concordant @@ -60,20 +172,45 @@ If \code{weight = FALSE}, the split concordance will be mean(75\%, 25\%) = 50\%. other implementations (e.g. IQ-TREE) follow \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample of quartets for a faster, if potentially less accurate, computation. +Ambiguous and inapplicable tokens are treated as containing no grouping +information (i.e. \code{(02)} or \code{-} are each treated as \verb{?}). 
+ +\code{PhylogeneticConcordance()} treats each character in \code{dataset} as a +phylogenetic hypothesis and measures the extent to which it supports the +splits of \code{tree}. Each character is first interpreted as a tree (or set of +trees) in which taxa sharing the same token form a clade. Only splits for +which the character contains at least four relevant taxa can contribute +information. + +For each split, the function identifies which characters could potentially +support that split (i.e. those for which the induced subtrees contain +informative structure), and among these, which characters are actually +compatible with the split. The concordance value for each split is the +proportion of informative characters that support it. +A value of 1 indicates that all characters informative for that subset of +taxa support the split; a value of 0 indicates that none do. Characters that +contain only ambiguous or uninformative states for the relevant taxa do not +affect the result. -\strong{NOTE:} These functions are under development. They are incompletely -tested, and may change without notice. -Complete documentation and discussion will follow in due course. +\code{SharedPhylogeneticConcordance()} treats each character as a simple tree. +Each token in the character corresponds to a node whose pendant edges are the +taxa with that token. +The Shared Phylogenetic Concordance for each character in \code{dataset} is then +the Shared Phylogenetic Information \insertCite{Smith2020}{TreeSearch} of +this tree and \code{tree}. 
} \examples{ data("congreveLamsdellMatrices", package = "TreeSearch") dataset <- congreveLamsdellMatrices[[1]][, 1:20] -tree <- referenceTree -qc <- QuartetConcordance(tree, dataset) +tree <- TreeSearch::referenceTree + cc <- ClusteringConcordance(tree, dataset) +mcc <- MutualClusteringConcordance(tree, dataset) + +qc <- QuartetConcordance(tree, dataset) + pc <- PhylogeneticConcordance(tree, dataset) spc <- SharedPhylogeneticConcordance(tree, dataset) -mcc <- MutualClusteringConcordance(tree, dataset) oPar <- par(mar = rep(0, 4), cex = 0.8) # Set plotting parameters plot(tree) @@ -83,7 +220,7 @@ TreeTools::LabelSplits(tree, signif(cc, 3), cex = 0.8) par(oPar) # Restore plotting parameters # Write concordance factors to file -labels <- paste0(qc, "/", cc, "/", pc) # "/" is a valid delimiter +labels <- paste0(cc, "/", qc, "/", pc) # "/" is a valid delimiter # Identify the node that corresponds to each label whichNode <- match(TreeTools::NTip(tree) + 1:tree$Nnode, names(qc)) @@ -93,12 +230,19 @@ tree$node.label <- labels[whichNode] ape::write.tree(tree) # or write.nexus(tree, file = "mytree.nex") # Display correlation between concordance factors -pairs(cbind(qc, cc, pc, spc, mcc), asp = 1) +pairs(cbind(cc, mcc, qc, pc, spc), asp = 1) +data(congreveLamsdellMatrices) +myMatrix <- congreveLamsdellMatrices[[10]] +ClusteringConcordance(TreeTools::NJTree(myMatrix), myMatrix) } \references{ \insertAllCited{} } \seealso{ +\itemize{ +\item \code{\link[=Consistency]{Consistency()}} +} + Other split support functions: \code{\link{JackLabels}()}, \code{\link{Jackknife}()}, diff --git a/man/dot-Rezero.Rd b/man/dot-Rezero.Rd new file mode 100644 index 000000000..9fb869b79 --- /dev/null +++ b/man/dot-Rezero.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Concordance.R +\name{.Rezero} +\alias{.Rezero} +\title{Re-zero a value by normalization} +\usage{ +.Rezero(value, zero) +} +\arguments{ +\item{value}{value ranging from zero to 
one} + +\item{zero}{new value to set as zero} +} +\description{ +Re-zero a value by normalization +} +\keyword{internal} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index bbb5c9868..d5708bd8c 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,30 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// expected_mi +double expected_mi(const IntegerVector& ni, const IntegerVector& nj); +RcppExport SEXP _TreeSearch_expected_mi(SEXP niSEXP, SEXP njSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const IntegerVector& >::type ni(niSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nj(njSEXP); + rcpp_result_gen = Rcpp::wrap(expected_mi(ni, nj)); + return rcpp_result_gen; +END_RCPP +} +// mi_key +RawVector mi_key(IntegerVector ni, IntegerVector nj); +RcppExport SEXP _TreeSearch_mi_key(SEXP niSEXP, SEXP njSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< IntegerVector >::type ni(niSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type nj(njSEXP); + rcpp_result_gen = Rcpp::wrap(mi_key(ni, nj)); + return rcpp_result_gen; +END_RCPP +} // preorder_morphy int preorder_morphy(IntegerMatrix edge, SEXP MorphyHandl); RcppExport SEXP _TreeSearch_preorder_morphy(SEXP edgeSEXP, SEXP MorphyHandlSEXP) { @@ -67,6 +91,18 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// quartet_concordance +List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix characters); +RcppExport SEXP _TreeSearch_quartet_concordance(SEXP splitsSEXP, SEXP charactersSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const LogicalMatrix >::type splits(splitsSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix >::type characters(charactersSEXP); + rcpp_result_gen = 
Rcpp::wrap(quartet_concordance(splits, characters)); + return rcpp_result_gen; +END_RCPP +} // nni IntegerMatrix nni(const IntegerMatrix edge, const IntegerVector randomEdge, const IntegerVector whichSwitch); RcppExport SEXP _TreeSearch_nni(SEXP edgeSEXP, SEXP randomEdgeSEXP, SEXP whichSwitchSEXP) { diff --git a/src/TreeSearch-init.c b/src/TreeSearch-init.c index 729dc2a76..f054c9aa6 100644 --- a/src/TreeSearch-init.c +++ b/src/TreeSearch-init.c @@ -23,6 +23,11 @@ extern SEXP _TreeSearch_preorder_morphy_by_char(SEXP, SEXP); extern SEXP _TreeSearch_morphy_iw(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP _TreeSearch_morphy_profile(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP _TreeSearch_expected_mi(SEXP, SEXP); +extern SEXP _TreeSearch_mi_key(SEXP, SEXP); + +extern SEXP _TreeSearch_quartet_concordance(SEXP, SEXP); + static const R_CallMethodDef callMethods[] = { {"_R_wrap_mpl_new_Morphy", (DL_FUNC) &_R_wrap_mpl_new_Morphy, 0}, {"_R_wrap_mpl_delete_Morphy", (DL_FUNC) &_R_wrap_mpl_delete_Morphy, 1}, @@ -55,8 +60,14 @@ static const R_CallMethodDef callMethods[] = { // {"_TreeSearch_tbr_moves", (DL_FUNC) &_TreeSearch_tbr_moves, 1}, {"_TreeSearch_preorder_morphy", (DL_FUNC) &_TreeSearch_preorder_morphy, 2}, {"_TreeSearch_preorder_morphy_by_char", (DL_FUNC) &_TreeSearch_preorder_morphy_by_char, 2}, + {"_TreeSearch_morphy_iw", (DL_FUNC) &_TreeSearch_morphy_iw, 7}, {"_TreeSearch_morphy_profile", (DL_FUNC) &_TreeSearch_morphy_profile, 6}, + {"_TreeSearch_expected_mi", (DL_FUNC) &_TreeSearch_expected_mi, 2}, + {"_TreeSearch_mi_key", (DL_FUNC) &_TreeSearch_mi_key, 2}, + + {"_TreeSearch_quartet_concordance",(DL_FUNC) &_TreeSearch_quartet_concordance, 2}, + {"MORPHYLENGTH", (DL_FUNC) &MORPHYLENGTH, 4}, {"RANDOM_TREE", (DL_FUNC) &RANDOM_TREE, 1}, {"RANDOM_TREE_SCORE", (DL_FUNC) &RANDOM_TREE_SCORE, 2}, diff --git a/src/expected_mi.cpp b/src/expected_mi.cpp new file mode 100644 index 000000000..867b97b23 --- /dev/null +++ b/src/expected_mi.cpp @@ -0,0 +1,128 @@ 
+#include <Rcpp.h> +#include <cmath> +#include <cstdint> +#include <cstddef> +#include <vector> +#include <algorithm> +using namespace Rcpp; + +#define MAX_FACTORIAL_LOOKUP 8192 +static double log2_factorial_table[MAX_FACTORIAL_LOOKUP + 1]; +static const double LOG2_E = 1.4426950408889634; + +__attribute__((constructor)) +void initialize_factorial_cache() { + log2_factorial_table[0] = 0.0; + for (int i = 1; i <= MAX_FACTORIAL_LOOKUP; i++) { + log2_factorial_table[i] = log2_factorial_table[i - 1] + std::log2(i); + } +} + +// Fast lookup with bounds checking +inline double l2factorial(int n) { + if (n <= MAX_FACTORIAL_LOOKUP) { + return log2_factorial_table[n]; + } else { + return lgamma(n + 1) * LOG2_E; + } +} + +// ni and nj are vectors listing the number of entities in each cluster +// [[Rcpp::export]] +double expected_mi(const IntegerVector &ni, const IntegerVector &nj) { + // ni = {a, N-a}; nj = counts of character states + const int a = ni[0]; + const int N = ni[0] + ni[1]; + if (a <= 0 || a >= N) return 0.0; // trivial split + + const double invN = 1.0 / static_cast<double>(N); + const double log2N = std::log2(static_cast<double>(N)); + const double log2a = std::log2(static_cast<double>(a)); + const double log2Na = std::log2(static_cast<double>(N - a)); + const double log2_denom = l2factorial(N) - l2factorial(a) - l2factorial(N - a); + + double emi = 0.0; + + for (int j = 0; j < nj.size(); ++j) { + int mj = nj[j]; + if (mj <= 0) continue; + + int kmin = std::max(0, a + mj - N); + int kmax = std::min(a, mj); + if (kmin > kmax) continue; + + const double log2mj = std::log2(static_cast<double>(mj)); + + // compute P(K=kmin) + double log2P = (l2factorial(mj) - l2factorial(kmin) - l2factorial(mj - kmin)) + + (l2factorial(N - mj) - l2factorial(a - kmin) - l2factorial(N - mj - (a - kmin))) + - log2_denom; + double Pk = std::pow(2.0, log2P); + + for (int k = kmin; k <= kmax; ++k) { + if (Pk > 0.0) { + // contribution from inside the split + if (k > 0) { + double mi_in = std::log2(static_cast<double>(k)) + log2N - (log2a + log2mj); + emi += (static_cast<double>(k) * invN) * mi_in 
* Pk; + } + // contribution from outside the split + int kout = mj - k; + if (kout > 0) { + double mi_out = std::log2(static_cast<double>(kout)) + log2N - (log2Na + log2mj); + emi += (static_cast<double>(kout) * invN) * mi_out * Pk; + } + } + // Update P(k) → P(k+1) + if (k < kmax) { + double numer = static_cast<double>((mj - k) * (a - k)); + double denom = static_cast<double>((k + 1) * (N - mj - a + k + 1)); + Pk *= numer / denom; + } + } + } + + return emi; +} + +// [[Rcpp::export]] +RawVector mi_key(IntegerVector ni, IntegerVector nj) { + if (ni.size() != 2) { + Rcpp::stop("ni must be a vector of length 2."); + } + + std::vector<int> ni_vals = {static_cast<int>(ni[0]), + static_cast<int>(ni[1])}; + if (ni_vals[0] > 65535 || ni_vals[1] > 65535) { + Rcpp::stop("ni values must be ≤ 65535."); + } + std::sort(ni_vals.begin(), ni_vals.end()); + + std::vector<uint16_t> nj_vals; + for (int val : nj) { + if (val > 65535) { + Rcpp::stop("nj values must be ≤ 65535."); + } + nj_vals.push_back(static_cast<uint16_t>(val)); + } + std::sort(nj_vals.begin(), nj_vals.end()); + + + // Total number of 16-bit ints + size_t n = 2 + nj_vals.size(); + RawVector key_raw(n * 2); + + // Write ni + key_raw[0] = ni_vals[0] >> 8; + key_raw[1] = ni_vals[0] & 0xFF; + key_raw[2] = ni_vals[1] >> 8; + key_raw[3] = ni_vals[1] & 0xFF; + + // Write nj + for (size_t i = 0; i < nj_vals.size(); ++i) { + key_raw[4 + i + i] = nj_vals[i] >> 8; + key_raw[4 + i + i + 1] = nj_vals[i] & 0xFF; + } + + return key_raw; +} diff --git a/src/quartet_concordance.cpp b/src/quartet_concordance.cpp new file mode 100644 index 000000000..dd3cbb495 --- /dev/null +++ b/src/quartet_concordance.cpp @@ -0,0 +1,82 @@ +#include <Rcpp.h> +#include <vector> + +using namespace Rcpp; + +// [[Rcpp::export]] +List quartet_concordance(const LogicalMatrix splits, const IntegerMatrix characters) { + const int n_splits = splits.ncol(); + const int n_chars = characters.ncol(); + const int n_taxa = splits.nrow(); + + NumericMatrix concordant(n_splits, n_chars); + NumericMatrix decisive(n_splits, n_chars); + + // 
Pre-allocate buffers + std::vector<int> n0(32, 0), n1(32, 0); + std::vector<int> active_states; + active_states.reserve(32); + + for (int c = 0; c < n_chars; ++c) { + // Cache character column for memory locality + std::vector<int> char_col(n_taxa); + for (int t = 0; t < n_taxa; ++t) char_col[t] = characters(t, c); + + for (int s = 0; s < n_splits; ++s) { + active_states.clear(); + + for (int t = 0; t < n_taxa; ++t) { + int state = char_col[t]; + if (IntegerVector::is_na(state)) continue; + + if (state >= (int)n0.size()) { + n0.resize(state + 1, 0); + n1.resize(state + 1, 0); + } + + if (n0[state] == 0 && n1[state] == 0) { + active_states.push_back(state); + } + + if (splits(t, s)) { + n1[state]++; + } else { + n0[state]++; + } + } + + if (active_states.size() >= 2) { + double total_choose_n1 = 0; + double sum_prod = 0; + double sum_sq_prod = 0; + + for (int state : active_states) { + double cnt1 = n1[state]; + total_choose_n1 += (cnt1 * (cnt1 - 1.0) / 2.0); + + double p = (double)n0[state] * cnt1; + sum_prod += p; + sum_sq_prod += p * p; + } + + double conc_val = 0; + for (int state : active_states) { + double cn0 = (double)n0[state] * (n0[state] - 1.0) / 2.0; + double cn1 = (double)n1[state] * (n1[state] - 1.0) / 2.0; + conc_val += cn0 * (total_choose_n1 - cn1); + } + + concordant(s, c) = conc_val; + decisive(s, c) = conc_val + (0.5 * (sum_prod * sum_prod - sum_sq_prod)); + } + + // Reset buffers for next split + for (int state : active_states) { + n0[state] = 0; + n1[state] = 0; + } + } + } + + return List::create(_["concordant"] = concordant, _["decisive"] = decisive); +} diff --git a/tests/testthat/test-Concordance.R b/tests/testthat/test-Concordance.R index b4ffdc786..6f46b05eb 100644 --- a/tests/testthat/test-Concordance.R +++ b/tests/testthat/test-Concordance.R @@ -4,8 +4,10 @@ test_that("_Concordance() handles null input", { expect_warning(expect_null(QuartetConcordance(BalancedTree(8), NULL))) expect_warning(expect_null(PhylogeneticConcordance(BalancedTree(8), NULL))) 
expect_warning(expect_null(ClusteringConcordance(BalancedTree(8), NULL))) - expect_warning(expect_null(MutualClusteringConcordance(BalancedTree(8), NULL))) - expect_warning(expect_null(SharedPhylogeneticConcordance(BalancedTree(8), NULL))) + expect_warning(expect_null( + MutualClusteringConcordance(BalancedTree(8), NULL))) + expect_warning(expect_null( + SharedPhylogeneticConcordance(BalancedTree(8), NULL))) }) test_that("_Concordance() handles tip mismatch", { @@ -13,6 +15,8 @@ test_that("_Concordance() handles tip mismatch", { tree <- BalancedTree(5) expect_warning(expect_null(QuartetConcordance(tree, char)), "No overlap between tree labels and dataset.") + expect_warning(expect_null(ClusteringConcordance(tree, char)), + "Could not find 't1', .* in names.dataset.") }) test_that("QuartetConcordance() works", { @@ -76,6 +80,7 @@ test_that("QuartetConcordance() works", { }) test_that("QuartetConcordance() handles ambiguity", { + library("TreeTools", quietly = TRUE) tree <- BalancedTree(12) splits <- as.Splits(tree) mataset <- matrix(c(0, 0, "{01}", 0, 0, "{01}", 1, 1, "-", 1, 1, "-", @@ -114,11 +119,59 @@ test_that("QuartetConcordance() handles incomplete data", { expect_equal(unname(QuartetConcordance(tree, dat)), rep(NA_real_, 5)) }) -dataset <- congreveLamsdellMatrices[[10]][, 1] -tree <- TreeTools::NJTree(dataset) +test_that(".Rezero() works", { + expect_equal(.Rezero(seq(0, 1, by = 0.1), 0.1), -1:9 / 9) +}) -ConcordantInformation(tree, dataset)["noise"] -TreeLength(tree, dataset, concavity = "prof") +test_that("ClusteringConcordance() gives sensible values", { + tree <- BalancedTree(8) + splits <- as.Splits(tree) + # None of these characters are informative + mataset <- matrix(c(0, 0, 0, 0, 0, 0, 0, 1, + rep("?", 8)), 8, + dimnames = list(paste0("t", 1:8), NULL)) + dat <- MatrixToPhyDat(mataset) + + expect_equal(unname(ClusteringConcordance(tree, dat)), rep(NA_real_, 5)) + + tree <- ape::read.tree(text = "((a, b, c, d, e), (f, g, h));") + split <- 
as.Splits(tree) + + mataset <- matrix(c(0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 1, 1, 1, # Matches split + 0, 0, 0, 0, 1, 1, 1, 1, # Consistent but not identical + 0, 0, 0, 1, 1, 1, 1, 1, # Consistent, more different + 0, 0, 0, 0, 0, 0, 1, 1, # Consistent other way + 0, 1, 0, 1, 0, 1, 0, 1, # Worst possible + 0, 0, 0, 0, rep("?", 4), # No information + 0, 0, 1, 1, rep("?", 4), # No relevant information + rep("?", 8)), 8, + dimnames = list(letters[1:8], NULL)) + dat <- MatrixToPhyDat(mataset) + cc <- ClusteringConcordance(tree, dat, return = "all")[, "10", ] + .Entropy <- function(...) { + TreeDist::Entropy(c(...) / sum(...)) + } + .NormExp <- function(a, b, ab) { + .Rezero( + (.Entropy(a) + .Entropy(b) - .Entropy(ab)) / .Entropy(a), + .ExpectedMI(a, b) / .Entropy(a) + ) + } + expect_equal(cc["normalized", ], + c(NA_real_, 1, + .NormExp(c(3, 5), c(4, 4), c(1, 3, 4)), + .NormExp(c(3, 5), c(3, 5), c(2, 3, 3)), + .NormExp(c(2, 6), c(3, 5), c(2, 1, 5)), + .NormExp(c(3, 5), c(4, 4), c(2, 1, 2, 3)), + NA, NA, NA)) + + randomset <- matrix(sample(0:1, 8 * 1000, replace = TRUE), 8, + dimnames = list(letters[1:8], NULL)) + rat <- MatrixToPhyDat(randomset) + expect_equal(ClusteringConcordance(tree, rat, normalize = TRUE), c("10" = 0), + tolerance = 0.05) +}) test_that("ConcordantInformation() works", { data(congreveLamsdellMatrices) @@ -148,3 +201,11 @@ test_that("ConcordantInformation() works", { log2(3) - log2(3)), ci["ignored"]) }) + +test_that("QACol() handles input", { + expect_equal(is.na(QACol(c(0, 1, NA, 0, NA), c(0, 1, NA, NA, 0))), + c(FALSE, FALSE, TRUE, + FALSE, # No data = black, quality NA by definition + TRUE)) + expect_equal(is.na(QCol(NA, c(0, 1, NA, NA))), c(FALSE, FALSE, TRUE, TRUE)) +})