diff --git a/SQANTI-sc_env.yml b/SQANTI-sc_env.yml index 629d154..9f8e0d1 100644 --- a/SQANTI-sc_env.yml +++ b/SQANTI-sc_env.yml @@ -2,7 +2,6 @@ name: SQANTI-sc_env channels: - conda-forge - bioconda - - defaults dependencies: - argcomplete=3.4.0 - bcbio-gff=0.7.1 @@ -48,7 +47,6 @@ dependencies: - r-htmltools=0.5.8.1 - r-jsonlite=1.8.9 - r-optparse=1.7.5 - - r-plotly=4.10.4 - r-plyr=1.8.9 - r-purrr=1.0.2 - r-randomForest=4.7 diff --git a/src/cell_metrics.py b/src/cell_metrics.py index a818b18..05ac3be 100644 --- a/src/cell_metrics.py +++ b/src/cell_metrics.py @@ -134,36 +134,62 @@ def safe_prop(numer, denom): summary['Novel_genes'] = cls_valid[~anno].groupby('CB')['associated_gene'].nunique().reindex(summary.index, fill_value=0) if not junc.empty: - if 'CB' not in junc.columns or (junc['CB'].fillna('') == '').all(): - iso_to_cb = cls_valid[['isoform','CB']].dropna().drop_duplicates() - junc = pd.merge(junc, iso_to_cb, on='isoform', how='left') - jv = junc[(junc['CB'].notna()) & (junc['CB'] != '')].copy() - if not jv.empty: - jv['junction_type'] = jv['junction_category'].astype(str) + '_' + jv['canonical'].astype(str) - counts = jv.groupby(['CB','junction_type']).size().unstack(fill_value=0) - for tp in ['known_canonical','known_non_canonical','novel_canonical','novel_non_canonical']: + junc_types = ['known_canonical', 'known_non_canonical', 'novel_canonical', 'novel_non_canonical'] + junc_rename = { + 'known_canonical': 'Known_canonical_junctions', + 'known_non_canonical': 'Known_non_canonical_junctions', + 'novel_canonical': 'Novel_canonical_junctions', + 'novel_non_canonical': 'Novel_non_canonical_junctions' + } + + if args.mode == 'isoforms': + # In isoforms mode the junction file's CB column is a comma-separated + # list (same as the classification file). cls_valid is already exploded + # to one row per (isoform, CB) with _count = FL for that cell. + # Join junctions to cls_valid by isoform ID so each junction gets + # replicated once per cell, weighted by that cell's FL count. + iso_col = next((c for c in ['isoform', 'readID', 'read_id', 'ID', 'read_name', 'read'] + if c in junc.columns and c in cls_valid.columns), None) + if iso_col is not None and 'junction_category' in junc.columns and 'canonical' in junc.columns: + jv = pd.merge( + junc[[iso_col, 'junction_category', 'canonical']], + cls_valid[[iso_col, 'CB', '_count']].drop_duplicates(), + on=iso_col, how='inner' + ) + jv['junction_type'] = jv['junction_category'].astype(str) + '_' + jv['canonical'].astype(str) + counts = jv.groupby(['CB', 'junction_type'])['_count'].sum().unstack(fill_value=0) + else: + counts = pd.DataFrame(index=summary.index) + else: + # Reads mode: each junction row has a single CB; count rows. + if 'CB' not in junc.columns or (junc['CB'].fillna('') == '').all(): + iso_to_cb = cls_valid[['isoform', 'CB']].dropna().drop_duplicates() if 'isoform' in cls_valid.columns else pd.DataFrame() + if not iso_to_cb.empty and 'isoform' in junc.columns: + junc = pd.merge(junc, iso_to_cb, on='isoform', how='left') + jv = junc[(junc['CB'].notna()) & (junc['CB'] != '')].copy() + if not jv.empty: + jv['junction_type'] = jv['junction_category'].astype(str) + '_' + jv['canonical'].astype(str) + counts = jv.groupby(['CB', 'junction_type']).size().unstack(fill_value=0) + else: + counts = pd.DataFrame(index=summary.index) + + if not counts.empty: + for tp in junc_types: if tp not in counts.columns: counts[tp] = 0 - counts['total_junctions'] = counts.sum(axis=1) - counts = counts.rename(columns={ - 'known_canonical':'Known_canonical_junctions', - 'known_non_canonical':'Known_non_canonical_junctions', - 'novel_canonical':'Novel_canonical_junctions', - 'novel_non_canonical':'Novel_non_canonical_junctions' - }) - for src, dst in [ - ('Known_canonical_junctions','Known_canonical_junctions_prop'), - ('Known_non_canonical_junctions','Known_non_canonical_junctions_prop'), - ('Novel_canonical_junctions','Novel_canonical_junctions_prop'), - ('Novel_non_canonical_junctions','Novel_non_canonical_junctions_prop')]: + counts['total_junctions'] = counts[junc_types].sum(axis=1) + counts = counts.rename(columns=junc_rename) + for src, dst in [(v, f"{v}_prop") for v in junc_rename.values()]: counts[dst] = safe_prop(counts[src].reindex(counts.index, fill_value=0), counts['total_junctions']) summary = summary.join(counts, how='left').fillna(0) else: - summary[['Known_canonical_junctions','Known_non_canonical_junctions','Novel_canonical_junctions','Novel_non_canonical_junctions','total_junctions', - 'Known_canonical_junctions_prop','Known_non_canonical_junctions_prop','Novel_canonical_junctions_prop','Novel_non_canonical_junctions_prop']] = 0 + for col in list(junc_rename.values()) + [f"{v}_prop" for v in junc_rename.values()] + ['total_junctions']: + summary[col] = 0 else: - summary[['Known_canonical_junctions','Known_non_canonical_junctions','Novel_canonical_junctions','Novel_non_canonical_junctions','total_junctions', - 'Known_canonical_junctions_prop','Known_non_canonical_junctions_prop','Novel_canonical_junctions_prop','Novel_non_canonical_junctions_prop']] = 0 + summary[['Known_canonical_junctions', 'Known_non_canonical_junctions', + 'Novel_canonical_junctions', 'Novel_non_canonical_junctions', 'total_junctions', + 'Known_canonical_junctions_prop', 'Known_non_canonical_junctions_prop', + 'Novel_canonical_junctions_prop', 'Novel_non_canonical_junctions_prop']] = 0 sublevels = { 'full-splice_match': ['alternative_3end','alternative_3end5end','alternative_5end','reference_match','mono-exon'], @@ -282,7 +308,10 @@ def compute_lenbins_by_cb(df_group): # Reference body coverage: parameterized threshold and export cutoff for plotting ref_cov_min = float(getattr(args, 'ref_cov_min_pct', 45.0)) cls_valid['ref_body_cov_flag'] = (cls_valid['length'] / cls_valid['ref_length'] * 100.0) >= ref_cov_min - for cat in structural_categories: + # Only FSM and ISM have a meaningful associated reference transcript and ref_length; + # other categories (NIC, NNC, Genic, etc.) should not have ref_coverage reported. + ref_cov_categories = ['full-splice_match', 'incomplete-splice_match'] + for cat in ref_cov_categories: tag = cat_to_tag[cat] sub = cls_valid[cls_valid['structural_category'] == cat] denom = summary[final_count_name(cat)] diff --git a/src/qc_reports.py b/src/qc_reports.py index b68d5e6..fd24ce3 100644 --- a/src/qc_reports.py +++ b/src/qc_reports.py @@ -43,10 +43,13 @@ def generate_report(args, df): clustering_file = os.path.join(os.path.dirname(outputPathPrefix), "clustering", "umap_results.csv") if os.path.isfile(clustering_file): flags.extend(["--clustering", clustering_file]) + + if hasattr(args, 'refGTF') and args.refGTF: + flags.extend(["--refGTF", f'"{args.refGTF}"']) cmd = ( f"Rscript {reportAssetsPath}/SQANTI-sc_report.R " - f"{class_file} {junc_file} {args.report} {outputPathPrefix} " + f"\"{class_file}\" \"{junc_file}\" {args.report} \"{outputPathPrefix}\" " f"{args.mode} {' '.join(flags)}" ) subprocess.run(cmd, shell=True, check=True) diff --git a/src/report_assets/SQANTI-sc_multisample_report.R b/src/report_assets/SQANTI-sc_multisample_report.R index a198cde..76325d0 100644 --- a/src/report_assets/SQANTI-sc_multisample_report.R +++ b/src/report_assets/SQANTI-sc_multisample_report.R @@ -1,1367 +1,879 @@ -#!/usr/env/bin Rscript - -############################################################ -##### SQANTI single-cell multisample report generation ##### -############################################################ - - - -### Author: Carlos Blanco - -#********************** Packages - -# !/usr/bin/env Rscript - -suppressPackageStartupMessages({ - library(dplyr) - library(tidyr) - library(ggplot2) - library(gridExtra) - library(grid) - library(ggdist) - library(plotly) - library(stringr) - library(rmarkdown) -}) - -# Ensure RColorConesa is available; if not, install from GitHub, then load -if (!requireNamespace("RColorConesa", quietly = TRUE)) { - if (!requireNamespace("devtools", quietly = TRUE)) { - install.packages("devtools") - } - devtools::install_github("ConesaLab/RColorConesa") -} -library(RColorConesa) - - -parse_args <- function() { - args <- commandArgs(trailingOnly = TRUE) - # Simple flag parser: expects --key value pairs, and a single string for --files (comma-separated) - res <- list(files = NULL, out_dir = ".", mode = "reads", report = "pdf", prefix = "SQANTI_sc_multi_report") - i <- 1 - while (i <= length(args)) { - key <- args[i] - if (startsWith(key, "--")) { - k <- substring(key, 3) - if (k %in% c("files", "out_dir", "mode", "report", "prefix")) { - if (i + 1 <= length(args)) { - res[[k]] <- args[i + 1] - i <- i + 2 - next - } else { - stop(sprintf("Missing value for flag %s", key)) - } - } else { - stop(sprintf("Unknown flag: %s", key)) - } - } else { - stop(sprintf("Unexpected argument: %s", key)) - } - } - if (is.null(res$files) || !nzchar(res$files)) { - stop("--files must be provided (comma-separated list of cell summary files)") - } - if (!(res$report %in% c("pdf", "html", "both"))) { - stop("--report must be one of: pdf, html, both") - } - res -} - -safe_read_summary <- function(fpath) { - # read.table supports gz automatically - df <- tryCatch( - { - read.table(fpath, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE) - }, - error = function(e) { - message(sprintf("[ERROR] Failed to read summary %s: %s", fpath, e$message)) - return(NULL) - } - ) - if (is.null(df)) { - return(NULL) - } - if (ncol(df) < 2 || !("CB" %in% colnames(df))) { - message(sprintf("[WARNING] Summary %s does not have expected structure; skipping", fpath)) - return(NULL) - } - # Coerce numeric columns (col 2..n) to numeric - if (ncol(df) >= 2) { - for (j in 2:ncol(df)) { - df[[j]] <- suppressWarnings(as.numeric(df[[j]])) - } - } - # Derive sampleID from filename: _SQANTI_cell_summary.txt.gz - base <- basename(fpath) - sample <- sub("_SQANTI_cell_summary\\.txt(\\.gz)?$", "", base) - df$sampleID <- sample - df -} - -# Helper: tidy feature names for titles and subtitles -format_feature_display_name <- function(feature) { - cleaned <- feature - suffixes <- c( - "_prop_in_cell$", "_perc_in_cell$", "_prop$", "_perc$", - "_percentage$", "_pct$", "_ratio$", "_count$", "_counts$", - "_in_cell$", "_per_cell$", "_value$" - ) - for (pattern in suffixes) { - cleaned <- gsub(pattern, "", cleaned, ignore.case = TRUE) - } - cleaned <- gsub("([0-9]+)b", "\\1 bp", cleaned, ignore.case = TRUE) - cleaned <- stringr::str_replace_all(cleaned, "_+", " ") - cleaned <- stringr::str_squish(cleaned) - if (cleaned == "") cleaned <- feature - tokens <- unlist(strsplit(cleaned, " ", fixed = FALSE)) - if (length(tokens) == 0) { - return(feature) - } - replacements <- c( - "Fsm" = "FSM", - "Ism" = "ISM", - "Nic" = "NIC", - "Nnc" = "NNC", - "Rts" = "RTS", - "Tss" = "TSS", - "Nmd" = "NMD", - "Cage" = "CAGE", - "Ujcs" = "UJCs", - "Ujc" = "UJC", - "Umis" = "UMIs", - "Umi" = "UMI", - "Mt" = "MT", - "Cb" = "CB", - "Pc" = "PC", - "Qc" = "QC", - "Orf" = "ORF", - "Tpm" = "TPM" - ) - normalize_token <- function(tok) { - if (tok == "") { - return(tok) - } - if (grepl("-", tok, fixed = TRUE)) { - parts <- strsplit(tok, "-", fixed = TRUE)[[1]] - parts <- vapply(parts, normalize_token, character(1), USE.NAMES = FALSE) - return(paste(parts, collapse = "-")) - } - if (tok == toupper(tok)) { - return(tok) - } - if (grepl("^[0-9]+(\\.[0-9]+)?$", tok)) { - return(tok) - } - if (grepl("^[0-9]+bp$", tok, ignore.case = TRUE)) { - return(gsub("bp$", "bp", tok, ignore.case = TRUE)) - } - key <- stringr::str_to_title(tok) - if (key %in% names(replacements)) { - return(replacements[[key]]) - } - stringr::str_to_title(tok) - } - tokens <- vapply(tokens, normalize_token, character(1), USE.NAMES = FALSE) - stringr::str_squish(paste(tokens, collapse = " ")) -} - -# Helper: derive axis labels and scaling behaviour from feature metadata -infer_feature_metadata <- function(feature, values) { - name_lower <- tolower(feature) - domain <- "Value" - if (stringr::str_detect(name_lower, "length")) domain <- "Length" - if (stringr::str_detect(name_lower, "length") && stringr::str_detect(name_lower, "prop|perc|pct|ratio|fraction")) { - if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" - } - if (stringr::str_detect(name_lower, "read")) { - if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" - } - if (stringr::str_detect(name_lower, "\\bmt\\b") || stringr::str_detect(name_lower, "^mt_")) domain <- "Reads" - if (stringr::str_detect(name_lower, "exon")) domain <- "Exons" - if (stringr::str_detect(name_lower, "intron")) domain <- "Introns" - if (stringr::str_detect(name_lower, "coverage")) domain <- "Coverage" - if (stringr::str_detect(name_lower, "isoform")) domain <- "Isoforms" - if (stringr::str_detect(name_lower, "transcript")) domain <- "Transcripts" - if (stringr::str_detect(name_lower, "gene")) domain <- "Genes" - if (stringr::str_detect(name_lower, "umi")) domain <- "UMIs" - if (stringr::str_detect(name_lower, "junction")) domain <- "Junctions" - structural_keywords <- c( - "fsm", "ism", "nic", "nnc", - "genic_genomic", "genic genomic", "genic", - "antisense", "fusion", "intergenic", - "genic_intron", "genic intron" - ) - if (any(stringr::str_detect(name_lower, structural_keywords))) { - if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" - } - is_prop_keyword <- stringr::str_detect(name_lower, "prop|perc|pct|ratio|fraction") - finite_vals <- values[is.finite(values)] - unit <- if (is_prop_keyword) "%" else "count" - scale_to_percent <- FALSE - if (length(finite_vals) > 0) { - maxv <- max(finite_vals) - minv <- min(finite_vals) - if (unit == "%" && maxv <= 1.5) { - scale_to_percent <- TRUE - } - if (unit == "count" && !is_prop_keyword && maxv <= 1.5 && minv >= 0) { - unit <- "%" - scale_to_percent <- TRUE - } - if (domain == "Length" && unit == "count") { - unit <- "bp" - } - } else { - if (unit == "%") scale_to_percent <- TRUE - } - if (unit == "%" && domain == "Value") { - domain <- "Percentage" - } - value_label <- switch(unit, - "%" = if (domain %in% c("Percentage", "Value")) "Value, %" else sprintf("%s, %%", domain), - "bp" = sprintf("%s, bp", domain), - sprintf("%s, count", domain) - ) - if (domain == "Value" && unit == "count") { - value_label <- "Value" - } - value_label <- infer_junction_display_label(feature, value_label) - list( - display_name = format_feature_display_name(feature), - value_label = value_label, - scale_to_percent = scale_to_percent, - unit = unit, - domain = domain - ) -} - -to_rgba <- function(col, alpha = 1) { - if (is.null(col) || is.na(col) || !nzchar(col)) { - return(sprintf("rgba(0,0,0,%.3f)", alpha)) - } - rgb <- grDevices::col2rgb(col) - sprintf("rgba(%d,%d,%d,%.3f)", rgb[1], rgb[2], rgb[3], alpha) -} - -infer_junction_display_label <- function(feature_name, current_label) { - lower_name <- tolower(feature_name) - junction_keywords <- c("junction", "junctions", "splice", "sj", "canonical", "noncanonical", "ujc", "ujcs") - contains_junction <- any(vapply(junction_keywords, function(kw) grepl(kw, lower_name, fixed = TRUE), logical(1))) - if (!contains_junction) { - return(current_label) - } - if (grepl("junct", current_label, ignore.case = TRUE)) { - return(current_label) - } - if (grepl("%", current_label, fixed = TRUE)) { - return(sub("%", " (junctions, %)", current_label, fixed = TRUE)) - } - if (grepl("count", current_label, ignore.case = TRUE) || grepl("junction", current_label, ignore.case = TRUE)) { - return(current_label) - } - paste0(current_label, ", junctions") -} - -get_conesa_palette_colors <- function(n, palette = "complete") { - if (n <= 0) { - return(character(0)) - } - - col_vec <- NULL - if (requireNamespace("RColorConesa", quietly = TRUE)) { - col_vec <- tryCatch( - { - scale_obj <- RColorConesa::scale_fill_conesa(palette = palette) - if (!is.null(scale_obj$palette) && is.function(scale_obj$palette)) { - scale_obj$palette(n) - } else { - NULL - } - }, - error = function(e) NULL - ) - - if (is.null(col_vec) || length(col_vec) == 0) { - col_vec <- tryCatch( - { - ns <- asNamespace("RColorConesa") - if (exists("conesa_palettes", envir = ns, inherits = FALSE)) { - pal_list <- get("conesa_palettes", envir = ns, inherits = FALSE) - pal_entry <- pal_list[[palette]] - if (is.function(pal_entry)) { - pal_entry(n) - } else if (is.vector(pal_entry)) { - unname(pal_entry) - } else { - NULL - } - } else { - NULL - } - }, - error = function(e) NULL - ) - } - - if ((is.null(col_vec) || length(col_vec) == 0) && exists("palette_conesa", envir = asNamespace("RColorConesa"), inherits = FALSE)) { - pal_fun <- get("palette_conesa", envir = asNamespace("RColorConesa"), inherits = FALSE) - col_vec <- tryCatch(pal_fun(palette, n), error = function(e) NULL) - } - } - - if (is.null(col_vec) || length(col_vec) == 0) { - fallback <- c( - "#6BAED6", "#FC8D59", "#78C679", "#EE6A50", "#969696", - "#66C2A4", "#FFD92F", "#E78AC3", "#A6D854", "#8DA0CB", - "#E5C494", "#B3B3B3" - ) - col_vec <- fallback - } - - rep_len(col_vec, n) -} - -# Helper: build per-feature violin + boxplot for a PCA loading -build_loading_feature_plot <- function(multi, feature_info, sample_levels) { - feature_name <- as.character(feature_info$variable)[1] - if (!feature_name %in% colnames(multi)) { - return(NULL) - } - values <- multi[[feature_name]] - if (!is.numeric(values)) { - return(NULL) - } - plot_df <- multi %>% - select(sampleID, value = all_of(feature_name)) %>% - mutate(value = as.numeric(value)) %>% - filter(is.finite(value)) - if (nrow(plot_df) == 0) { - return(NULL) - } - info <- infer_feature_metadata(feature_name, plot_df$value) - if (info$scale_to_percent) { - plot_df <- plot_df %>% mutate(value = value * 100) - } - if (length(sample_levels) == 0) { - sample_levels <- unique(plot_df$sampleID) - } - plot_df <- plot_df %>% mutate(sampleID = factor(sampleID, levels = sample_levels)) - uniqueness <- plot_df %>% - group_by(sampleID) %>% - summarise(unique_vals = n_distinct(value), .groups = "drop") - use_violin <- any(uniqueness$unique_vals > 1) - loading_value <- as.numeric(feature_info$loading)[1] - loading_rank <- as.integer(feature_info$rank)[1] - pc_label <- as.character(feature_info$PC)[1] - gp <- ggplot(plot_df, aes(x = sampleID, y = value, fill = sampleID, colour = sampleID)) - if (use_violin) { - gp <- gp + geom_violin(trim = TRUE, scale = "width", alpha = 0.7, linewidth = 0.3) - } - gp <- gp + - geom_boxplot( - width = 0.05, outlier.shape = NA, - alpha = 0.3, - colour = "grey20" - ) + - stat_summary( - fun = mean, geom = "point", shape = 4, size = 1, - colour = "red", stroke = 0.45 - ) + - scale_fill_conesa(palette = "complete", drop = FALSE) + - scale_color_conesa(palette = "complete", guide = "none", drop = FALSE) + - labs( - title = sprintf("Per Sample %s Distribution Across Cells", info$display_name), - subtitle = sprintf("%s loading rank #%d (loading = %.3f)", pc_label, loading_rank, loading_value), - x = "Sample", - y = info$value_label - ) + - theme_classic(base_size = 16) + - theme( - legend.position = "none", - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14, angle = 35, hjust = 1), - axis.text.y = element_text(size = 14) - ) - plot_df_html <- plot_df - plot_df_html$sampleID <- as.character(plot_df_html$sampleID) - hover_tmpl <- sprintf("Sample: %%{x}
%s: %%{y:.3f}", info$value_label) - - sample_levels_html <- if (!is.null(sample_levels) && length(sample_levels) > 0) sample_levels else unique(plot_df_html$sampleID) - palette_cols <- get_conesa_palette_colors(length(sample_levels_html), palette = "complete") - sample_color_map <- setNames(palette_cols, sample_levels_html) - - plt_html <- plotly::plot_ly() - for (idx in seq_along(sample_levels_html)) { - sample_nm <- sample_levels_html[[idx]] - sample_df <- plot_df_html %>% filter(sampleID == sample_nm) - if (nrow(sample_df) == 0) next - col_val <- sample_color_map[[sample_nm]] - violin_fill <- to_rgba(col_val, 0.7) - box_fill <- to_rgba(col_val, 0.3) - line_col <- to_rgba(col_val, 1) - plt_html <- plt_html %>% - plotly::add_trace( - data = sample_df, - x = ~sampleID, - y = ~value, - type = "violin", - name = sample_nm, - legendgroup = sample_nm, - showlegend = TRUE, - hovertemplate = hover_tmpl, - fillcolor = violin_fill, - line = list(color = line_col, width = 1.1), - spanmode = "hard", - scalemode = "width", - width = 0.85, - points = "none", - box = list(visible = FALSE), - meanline = list(visible = FALSE) - ) %>% - plotly::add_trace( - data = sample_df, - x = ~sampleID, - y = ~value, - type = "box", - name = paste0(sample_nm, " (IQR)"), - legendgroup = sample_nm, - showlegend = FALSE, - hoverinfo = "skip", - fillcolor = box_fill, - line = list(color = to_rgba("#333333", 1), width = 1), - boxpoints = FALSE, - width = 0.05 - ) - } - - mean_df <- plot_df_html %>% - group_by(sampleID) %>% - summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop") - if (nrow(mean_df) > 0) { - plt_html <- plt_html %>% plotly::add_trace( - data = mean_df, - x = ~sampleID, - y = ~mean_value, - type = "scatter", - mode = "markers", - name = "Mean", - legendgroup = "Mean", - hovertemplate = hover_tmpl, - marker = list(symbol = "x-thin", size = 8, color = "red", line = list(width = 0)), - showlegend = FALSE - ) - } - - plt_html <- plt_html %>% plotly::layout( - title = list( - text = sprintf("Per Sample %s Distribution Across Cells", info$display_name), - x = 0.5, - xanchor = "center", - font = list(size = 18, family = "Arial") - ), - xaxis = list( - title = list(text = "Sample", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), - tickangle = 45, - showline = TRUE, - linecolor = "#000000", - linewidth = 1.1, - mirror = FALSE, - zeroline = FALSE, - standoff = 26, - automargin = TRUE - ), - yaxis = list( - title = list(text = info$value_label, font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), - showline = TRUE, - linecolor = "#000000", - linewidth = 1.1, - zeroline = FALSE, - standoff = 8, - automargin = TRUE - ), - legend = list( - orientation = "h", - x = 0.5, - xanchor = "center", - y = -0.25, - yanchor = "top", - font = list(size = 14, family = "Arial"), - title = list(text = "") - ), - margin = list(t = 60, b = 250, l = 130, r = 80), - hovermode = "closest", - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 14), - height = 700 - ) - - list(ggplot = gp, plotly = plt_html) -} - -main <- function() { - params <- parse_args() - - files <- unlist(strsplit(params$files, ",", fixed = TRUE)) - files <- trimws(files) - files <- files[nchar(files) > 0] - if (length(files) < 2) { - message("[INFO] Fewer than 2 files provided. Nothing to do.") - quit(status = 0) - } - - # Read all summaries - lst <- lapply(files, safe_read_summary) - lst <- Filter(Negate(is.null), lst) - if (length(lst) < 2) { - message("[INFO] Fewer than 2 valid summaries after reading. Skipping.") - quit(status = 0) - } - - # Harmonize columns: union of all names; fill missing with 0 for numeric, "" otherwise - all_cols <- Reduce(union, lapply(lst, colnames)) - # Ensure CB and sampleID exist in final order front - all_cols <- unique(c("CB", setdiff(all_cols, "CB"))) - all_cols <- unique(c(all_cols, "sampleID")) - - norm_list <- lapply(lst, function(df) { - missing <- setdiff(all_cols, colnames(df)) - for (m in missing) { - df[[m]] <- if (m == "CB" || m == "sampleID") "" else 0 - } - # Reorder - df <- df[, all_cols] - df - }) - multi <- bind_rows(norm_list) - - sample_levels_global <- unique(multi$sampleID[!is.na(multi$sampleID)]) - - render_pdf <- params$report %in% c("pdf", "both") - render_html <- params$report %in% c("html", "both") - - # Basic cohort-level aggregates per sample - count_col <- if (params$mode == "isoforms") "Transcripts_in_cell" else "Reads_in_cell" - - per_sample_stats <- multi %>% - group_by(sampleID) %>% - summarise( - cells = n_distinct(CB), - mean_reads = mean(.data[[count_col]], na.rm = TRUE), - mean_umis = if ("UMIs_in_cell" %in% names(.)) mean(UMIs_in_cell, na.rm = TRUE) else NA, - median_reads = median(.data[[count_col]], na.rm = TRUE), - mean_genes = mean(Genes_in_cell, na.rm = TRUE), - median_genes = median(Genes_in_cell, na.rm = TRUE), - mean_annotated = mean(Annotated_genes, na.rm = TRUE), - mean_novel = mean(Novel_genes, na.rm = TRUE), - mean_ujc = if ("UJCs_in_cell" %in% names(.)) mean(UJCs_in_cell, na.rm = TRUE) else NA, - median_ujc = if ("UJCs_in_cell" %in% names(.)) median(UJCs_in_cell, na.rm = TRUE) else NA, - mean_mt = mean(MT_perc, na.rm = TRUE) - ) - - entity_label_plural <- if (params$mode == "isoforms") "Transcripts" else "Reads" - - summary_tbl <- per_sample_stats %>% - mutate(across(where(is.numeric), ~ round(., 3))) %>% - transmute( - Sample = sampleID, - `Cell\nBarcodes` = cells, - `Average\nReads` = mean_reads, - `Average\nUMIs` = mean_umis, - `Average\nAnnotated\nGenes` = mean_annotated, - `Average\nNovel\nGenes` = mean_novel, - `Average\nUJCs` = mean_ujc, - `Average\nMitochondrial\nReads` = mean_mt - ) - - # Rename columns dynamically - colnames(summary_tbl)[colnames(summary_tbl) == "Average\nReads"] <- paste0("Average\n", entity_label_plural) - colnames(summary_tbl)[colnames(summary_tbl) == "Average\nMitochondrial\nReads"] <- paste0("Average\nMitochondrial\n", entity_label_plural) - - if (params$mode == "isoforms") { - summary_tbl <- summary_tbl %>% select(-`Average\nUMIs`, -`Average\nUJCs`) - } - - summary_tbl_html <- summary_tbl - colnames(summary_tbl_html) <- gsub("\\n", "
", colnames(summary_tbl_html), fixed = TRUE) - - assign("multi_per_sample_stats", per_sample_stats, envir = .GlobalEnv) - assign("multi_summary_tbl_pdf", summary_tbl, envir = .GlobalEnv) - assign("multi_summary_tbl_html", summary_tbl_html, envir = .GlobalEnv) - assign("entity_label", if (params$mode == "isoforms") "Transcript" else "Read", envir = .GlobalEnv) - assign("entity_label_plural", entity_label_plural, envir = .GlobalEnv) - assign("mode", params$mode, envir = .GlobalEnv) - - # Output path - out_dir <- params$out_dir - if (!dir.exists(out_dir)) { - dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) - } - pdf_out <- file.path(out_dir, paste0(params$prefix, ".pdf")) - - if (render_html) { - message("[INFO] HTML report requested; plots will be rendered via R Markdown template.") - } - - # Structural category proportions, if available - cat_cols <- c("FSM_prop", "ISM_prop", "NIC_prop", "NNC_prop", "Genic_Genomic_prop", "Antisense_prop", "Fusion_prop", "Intergenic_prop", "Genic_intron_prop") - have_cats <- all(cat_cols %in% colnames(multi)) - if (have_cats) { - cats_long <- multi %>% - select(all_of(c("sampleID", cat_cols))) %>% - pivot_longer(cols = all_of(cat_cols), names_to = "category", values_to = "prop") %>% - mutate( - sampleID = factor(sampleID, levels = unique(multi$sampleID)), - category = factor(category, - levels = c( - "FSM_prop", "ISM_prop", "NIC_prop", "NNC_prop", - "Genic_Genomic_prop", "Antisense_prop", "Fusion_prop", "Intergenic_prop", "Genic_intron_prop" - ), - labels = c( - "FSM", "ISM", "NIC", "NNC", - "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic intron" - ) - ) - ) %>% - mutate(prop = suppressWarnings(as.numeric(prop))) - - p_cats <- ggplot(cats_long, aes(x = category, y = prop, fill = sampleID, colour = sampleID, group = sampleID)) + - ggdist::stat_slabinterval( - side = "left", - position = position_dodge(width = 0.6), - density = "unbounded", - bw = "nrd0", - normalize = "groups", scale = 0.85, adjust = 2.5, trim = TRUE, - show_point = FALSE, show_interval = FALSE, # slab only (fill only) - slab_colour = NA, alpha = 0.7 - ) + - ggdist::stat_slabinterval( - side = "left", - position = position_dodge(width = 0.6), - density = "unbounded", - bw = "nrd0", - normalize = "groups", scale = 0.85, adjust = 2.5, trim = TRUE, - show_point = FALSE, show_interval = FALSE, # outline-only - mapping = aes(slab_colour = after_scale(colour)), - fill = NA, slab_linewidth = 0.05, show.legend = FALSE - ) + - # Draw medians as short horizontal lines centered within each dodged slab - stat_summary( - data = cats_long, aes(group = sampleID), - fun = median, fun.min = median, fun.max = median, - geom = "crossbar", width = 0.1, - position = position_dodge(width = 0.6), - color = "black", linewidth = 0.1, alpha = 1 - ) + - # Add a small lower expansion so slabs don't touch the x-axis - scale_y_continuous(limits = c(0, 100), expand = expansion(add = c(1, 0))) + - scale_fill_conesa(palette = "complete") + - scale_color_conesa(palette = "complete", guide = "none") + - guides( - fill = guide_legend(override.aes = list(shape = 15, size = 5, alpha = 0.95, colour = NA, stroke = 0)), - linetype = "none", alpha = "none", size = "none", colour = "none" - ) + - theme_classic(base_size = 12) + - labs( - title = "Structural Category Proportions by Sample", - x = "Structural category", y = "Reads, %" - ) + - theme( - legend.position = "bottom", - axis.text.x = element_text(angle = 45, hjust = 1, size = 14), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14), - plot.title = element_text(size = 20, face = "bold", hjust = 0.5) - ) - - category_levels <- levels(cats_long$category) - sample_levels_all <- levels(cats_long$sampleID) - cat_to_col <- c( - "FSM" = "#6BAED6", - "ISM" = "#FC8D59", - "NIC" = "#78C679", - "NNC" = "#EE6A50", - "Genic Genomic" = "#969696", - "Antisense" = "#66C2A4", - "Fusion" = "goldenrod1", - "Intergenic" = "darksalmon", - "Genic intron" = "#41B6C4" - ) - - category_plots <- lapply(category_levels, function(cat_lab) { - dfp <- cats_long %>% filter(category == cat_lab) - cat_col <- unname(cat_to_col[[as.character(cat_lab)]]) - if (is.null(cat_col) || is.na(cat_col)) cat_col <- "grey60" - box_outline_col <- if (as.character(cat_lab) == "Genic Genomic") "grey90" else "grey20" - violin_fill <- grDevices::adjustcolor(cat_col, alpha.f = 0.7) - ggplot(dfp, aes(x = sampleID, y = prop)) + - geom_violin(fill = violin_fill, color = cat_col, linewidth = 0.3, width = 0.8, trim = TRUE) + - geom_boxplot(width = 0.05, outlier.shape = NA, fill = cat_col, color = box_outline_col, alpha = 0.3) + - stat_summary(fun = mean, geom = "point", shape = 4, size = 1, colour = "red", stroke = 0.9) + - scale_y_continuous(limits = c(0, 100), expand = expansion(add = c(1, 0))) + - theme_classic(base_size = 14) + - labs(title = paste0("Per Sample ", cat_lab, " Reads Distribution Across Cells"), x = "Sample", y = "Reads, %") + - theme( - legend.position = "none", - axis.text.x = element_text(angle = 0, hjust = 0.5, size = 14), - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14) - ) - }) - names(category_plots) <- as.character(category_levels) - - category_plots_html <- lapply(category_levels, function(cat_lab) { - dfp <- cats_long %>% - filter(category == cat_lab) %>% - mutate(sampleID = factor(sampleID, levels = sample_levels_all)) %>% - filter(is.finite(prop)) - - cat_col <- unname(cat_to_col[[as.character(cat_lab)]]) - if (is.null(cat_col) || is.na(cat_col)) cat_col <- "#6C757D" - violin_fill <- to_rgba(cat_col, 0.7) - box_fill <- to_rgba(cat_col, 0.3) - line_col <- to_rgba(cat_col, 1) - - plt <- plotly::plot_ly() - - for (sample_nm in levels(dfp$sampleID)) { - sample_df <- dfp %>% filter(sampleID == sample_nm) - if (nrow(sample_df) == 0) next - - plt <- plt %>% - plotly::add_trace( - data = sample_df, - x = ~sampleID, - y = ~prop, - type = "violin", - name = NULL, - showlegend = FALSE, - hovertemplate = "Sample: %{x}
Reads, %: %{y:.3f}", - fillcolor = violin_fill, - line = list(color = line_col, width = 1.1), - spanmode = "hard", - scalemode = "width", - width = 0.85, - points = "none", - box = list(visible = FALSE), - meanline = list(visible = FALSE) - ) %>% - plotly::add_trace( - data = sample_df, - x = ~sampleID, - y = ~prop, - type = "box", - name = NULL, - showlegend = FALSE, - hoverinfo = "skip", - fillcolor = box_fill, - line = list(color = to_rgba("#333333", 1), width = 1), - boxpoints = FALSE, - width = 0.05 - ) - } - - mean_df <- dfp %>% - group_by(sampleID) %>% - summarise(mean_prop = mean(prop, na.rm = TRUE), .groups = "drop") - if (nrow(mean_df) > 0) { - plt <- plt %>% plotly::add_trace( - data = mean_df, - x = ~sampleID, - y = ~mean_prop, - type = "scatter", - mode = "markers", - name = NULL, - showlegend = FALSE, - hovertemplate = "Sample: %{x}
Reads, %: %{y:.3f}", - marker = list(symbol = "x-thin", size = 8, line = list(width = 0), color = "red") - ) - } - - plt %>% - plotly::layout( - showlegend = FALSE, - title = list( - text = sprintf("Per Sample %s Reads Distribution Across Cells", cat_lab), - x = 0.5, - xanchor = "center", - font = list(size = 18, family = "Arial") - ), - xaxis = list( - title = list(text = "Sample", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), - tickangle = 45, - showline = TRUE, - linecolor = "#000000", - linewidth = 1.1, - mirror = FALSE, - zeroline = FALSE - ), - yaxis = list( - title = list(text = "Reads, %", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), - range = c(0, 100), - showline = TRUE, - linecolor = "#000000", - linewidth = 1.1, - zeroline = FALSE - ), - margin = list(t = 60, b = 160, l = 110, r = 60), - hovermode = "closest", - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 14), - height = 560 - ) - }) - names(category_plots_html) <- as.character(category_levels) - - assign("multi_structural_category_combined_plot", p_cats, envir = .GlobalEnv) - assign("multi_structural_category_violin_plots", category_plots, envir = .GlobalEnv) - assign("multi_structural_category_violin_plots_html", category_plots_html, envir = .GlobalEnv) - } - - multi_pca_scores_plot_local <- NULL - multi_pca_scores_plot_html_local <- NULL - multi_pca_scree_plot_local <- NULL - multi_pca_scree_plot_html_local <- NULL - multi_pca_top_loadings_plots_local <- NULL - multi_pca_top_loadings_plots_html_local <- NULL - multi_pca_loading_distribution_plots_local <- list() - multi_pca_loading_distribution_plots_html_local <- list() - # -------- PCA (all numeric features, per-sample medians) -------- - # 1) Select all numeric columns from the cell summary - num_cols <- names(multi)[sapply(multi, function(x) is.numeric(x) && !all(is.na(x)))] - # 2) Aggregate per-sample medians across all numeric features - agg_median <- multi %>% - group_by(sampleID) %>% - summarise(across(all_of(num_cols), ~ median(., na.rm = TRUE)), .groups = "drop") - - if (nrow(agg_median) >= 2 && ncol(agg_median) >= 2) { - # 3) Drop features with zero variance across samples - feat_sds <- sapply(agg_median %>% select(-sampleID), function(x) stats::sd(x, na.rm = TRUE)) - feat_keep <- names(feat_sds)[is.finite(feat_sds) & !is.na(feat_sds) & feat_sds > 0] - - if (length(feat_keep) >= 2) { - mat <- as.matrix(agg_median[, feat_keep, drop = FALSE]) - rownames(mat) <- agg_median$sampleID - pca_fit <- stats::prcomp(mat, center = TRUE, scale. = TRUE) - var_expl <- (pca_fit$sdev^2) / sum(pca_fit$sdev^2) - - # A) PC1–PC2 scatter (first among PCA plots) - if (ncol(pca_fit$x) >= 2) { - scores <- as.data.frame(pca_fit$x) - scores$sampleID <- rownames(scores) - gp_scores <- ggplot(scores, aes(x = PC1, y = PC2, colour = sampleID, label = sampleID)) + - geom_point(size = 3.8, alpha = 0.95, shape = 19, stroke = 0) + - scale_color_conesa(palette = "complete") + - theme_classic(base_size = 16) + - labs( - title = "PCA Plot Based on sampleID", - x = sprintf("PC1 (%.1f%%)", 100 * var_expl[1]), - y = sprintf("PC2 (%.1f%%)", 100 * var_expl[2]) - ) + - scale_x_continuous(labels = function(x) sprintf("%.2f", x)) + - scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + - theme( - legend.position = "bottom", - legend.title = element_blank(), - legend.text = element_text(size = 12), - legend.key = element_blank(), - legend.margin = margin(t = 16), - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14) - ) + - guides(colour = guide_legend(override.aes = list(size = 5, alpha = 0.95, stroke = 0))) - multi_pca_scores_plot_local <- gp_scores - assign("multi_pca_scores_plot", gp_scores, envir = .GlobalEnv) - - sample_levels_pca <- sample_levels_global[sample_levels_global %in% scores$sampleID] - if (length(sample_levels_pca) == 0) { - sample_levels_pca <- unique(scores$sampleID) - } - palette_cols_pca <- get_conesa_palette_colors(length(sample_levels_pca), palette = "complete") - sample_color_map_pca <- setNames(palette_cols_pca, sample_levels_pca) - scores_plotly <- plotly::plot_ly() - for (sample_nm in sample_levels_pca) { - sample_df <- scores %>% filter(sampleID == sample_nm) - if (nrow(sample_df) == 0) next - scores_plotly <- scores_plotly %>% plotly::add_trace( - data = sample_df, - x = ~PC1, - y = ~PC2, - type = "scatter", - mode = "markers", - name = sample_nm, - text = ~sampleID, - hovertemplate = "Sample: %{text}
PC1: %{x:.2f}
PC2: %{y:.2f}", - marker = list(size = 12, color = sample_color_map_pca[[sample_nm]], line = list(width = 0)) - ) - } - scores_plotly <- scores_plotly %>% - plotly::layout( - title = list(text = "PCA Plot Based on sampleID", font = list(size = 20, family = "Arial")), - xaxis = list( - title = list(text = sprintf("PC1 (%.1f%%)", 100 * var_expl[1]), font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), standoff = 12, automargin = TRUE - ), - yaxis = list( - title = list(text = sprintf("PC2 (%.1f%%)", 100 * var_expl[2]), font = list(size = 16, family = "Arial")), - tickfont = list(size = 14, family = "Arial"), standoff = 10, automargin = TRUE - ), - legend = list(orientation = "h", x = 0.5, xanchor = "center", y = -0.25, yanchor = "top", title = list(text = ""), font = list(size = 14, family = "Arial")), - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 16), - margin = list(t = 60, b = 120, l = 90, r = 60), - height = 520 - ) - multi_pca_scores_plot_html_local <- scores_plotly - assign("multi_pca_scores_plot_html", scores_plotly, envir = .GlobalEnv) - } - - # B) Scree plot (second) - k <- min(length(var_expl), 10) - scree_df <- data.frame( - PC = factor(paste0("PC", seq_len(k)), levels = paste0("PC", seq_len(k))), - Proportion = var_expl[seq_len(k)], - Cumulative = cumsum(var_expl)[seq_len(k)] - ) - gp_scree <- ggplot(scree_df, aes(x = PC)) + - geom_col(aes(y = Proportion, fill = "Proportion"), width = 0.8, colour = NA) + - geom_point(aes(y = Cumulative, colour = "Cumulative"), size = 2.2) + - geom_line(aes(y = Cumulative, colour = "Cumulative", group = 1), linewidth = 0.6) + - scale_fill_manual(values = c("Proportion" = "#6BAED6"), name = "") + - scale_color_manual(values = c("Cumulative" = "#4D4D4D"), name = "") + - theme_classic(base_size = 16) + - labs(title = "PCA scree plot", y = "Variance explained", x = "Principal component") + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.position = "bottom", - legend.margin = margin(t = 20) - ) - multi_pca_scree_plot_local <- gp_scree - assign("multi_pca_scree_plot", gp_scree, envir = .GlobalEnv) - - scree_plotly <- plotly::plot_ly( - scree_df, - x = ~PC, - y = ~Proportion, - type = "bar", - name = "Proportion", - marker = list(color = "#6BAED6") - ) %>% - plotly::add_trace( - y = ~Cumulative, - type = "scatter", - mode = "lines+markers", - name = "Cumulative", - hovertemplate = "PC: %{x}
Cumulative: %{y:.3f}", - line = list(color = "#4D4D4D", width = 2), - marker = list(color = "#4D4D4D", size = 9, line = list(width = 0)) - ) %>% - plotly::layout( - title = list(text = "PCA scree plot", font = list(size = 18, family = "Arial")), - yaxis = list( - title = list(text = "Variance explained", font = list(size = 16, family = "Arial")), tickfont = list(size = 14), - standoff = 6, automargin = TRUE - ), - xaxis = list( - title = list(text = "Principal component", font = list(size = 16, family = "Arial")), tickfont = list(size = 14), - standoff = 14, automargin = TRUE - ), - legend = list(orientation = "h", x = 0.5, xanchor = "center", y = -0.25, yanchor = "top", title = list(text = "")), - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - margin = list(t = 60, b = 120, l = 80, r = 40) - ) - multi_pca_scree_plot_html_local <- scree_plotly - assign("multi_pca_scree_plot_html", scree_plotly, envir = .GlobalEnv) - - # C) Top loadings for PC1 and PC2 (third) - if (ncol(pca_fit$rotation) >= 2) { - rot <- as.data.frame(pca_fit$rotation) - rot$variable <- rownames(rot) - top_n <- 10L - pick_top <- function(colname) { - ord <- order(abs(rot[[colname]]), decreasing = TRUE) - head(rot[ord, c("variable", colname)], top_n) - } - top_pc1 <- pick_top("PC1") - colnames(top_pc1) <- c("variable", "loading") - top_pc1 <- top_pc1 %>% - mutate( - variable = as.character(variable), - PC = "PC1", - rank = dplyr::row_number(), - sign = if_else(loading >= 0, "Positive", "Negative"), - abs_loading = abs(loading) - ) - top_pc2 <- pick_top("PC2") - colnames(top_pc2) <- c("variable", "loading") - top_pc2 <- top_pc2 %>% - mutate( - variable = as.character(variable), - PC = "PC2", - rank = dplyr::row_number(), - sign = if_else(loading >= 0, "Positive", "Negative"), - abs_loading = abs(loading) - ) - - top_pc1_plot <- top_pc1 - top_pc2_plot <- top_pc2 - top_pc1_plot$variable <- factor(top_pc1_plot$variable, levels = rev(top_pc1_plot$variable)) - top_pc2_plot$variable <- factor(top_pc2_plot$variable, levels = rev(top_pc2_plot$variable)) - top_pc1_plot$sign <- factor(top_pc1_plot$sign, levels = c("Positive", "Negative")) - top_pc2_plot$sign <- factor(top_pc2_plot$sign, levels = c("Positive", "Negative")) - - gp_load1 <- ggplot(top_pc1_plot, aes(x = variable, y = abs_loading, fill = sign)) + - geom_col(width = 0.7) + - coord_flip() + - scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + - theme_classic(base_size = 16) + - labs(title = "Top 10 loadings: PC1", x = "Feature", y = "Absolute loading") + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.position = "bottom" - ) - gp_load2 <- ggplot(top_pc2_plot, aes(x = variable, y = abs_loading, fill = sign)) + - geom_col(width = 0.7) + - coord_flip() + - scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + - theme_classic(base_size = 16) + - labs(title = "Top 10 loadings: PC2", x = "Feature", y = "Absolute loading") + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.position = "bottom" - ) - loadings_plots <- list(PC1 = gp_load1, PC2 = gp_load2) - multi_pca_top_loadings_plots_local <- loadings_plots - assign("multi_pca_top_loadings_plots", loadings_plots, envir = .GlobalEnv) - - pc1_plot_html <- plotly::plot_ly( - top_pc1_plot, - x = ~abs_loading, - y = ~variable, - color = ~sign, - colors = c("Positive" = "#78C679", "Negative" = "#EE6A50"), - type = "bar", - orientation = "h", - customdata = ~sign, - hovertemplate = "Feature: %{y}
|loading|: %{x:.3f}
Sign: %{customdata}" - ) %>% - plotly::layout( - title = list(text = "Top 10 loadings: PC1", font = list(size = 18, family = "Arial")), - xaxis = list( - title = list(text = "Absolute loading", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 10 - ), - yaxis = list( - title = list(text = "Feature", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 6 - ), - legend = list( - orientation = "h", - x = 0.5, - xanchor = "center", - y = -0.25, - yanchor = "top", - font = list(size = 14, family = "Arial"), - title = list(text = "Sign") - ), - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - margin = list(t = 60, b = 110, l = 190, r = 70), - font = list(family = "Arial", size = 14), - height = 460 - ) - - pc2_plot_html <- plotly::plot_ly( - top_pc2_plot, - x = ~abs_loading, - y = ~variable, - color = ~sign, - colors = c("Positive" = "#78C679", "Negative" = "#EE6A50"), - type = "bar", - orientation = "h", - customdata = ~sign, - hovertemplate = "Feature: %{y}
|loading|: %{x:.3f}
Sign: %{customdata}" - ) %>% - plotly::layout( - title = list(text = "Top 10 loadings: PC2", font = list(size = 18, family = "Arial")), - xaxis = list( - title = list(text = "Absolute loading", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 10 - ), - yaxis = list( - title = list(text = "", font = list(size = 16, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 6 - ), - legend = list( - orientation = "h", - x = 0.5, - xanchor = "center", - y = -0.25, - yanchor = "top", - font = list(size = 14, family = "Arial"), - title = list(text = "Sign") - ), - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - margin = list(t = 60, b = 110, l = 190, r = 70), - font = list(family = "Arial", size = 14), - height = 460 - ) - - loadings_plots_html <- list(PC1 = pc1_plot_html, PC2 = pc2_plot_html) - multi_pca_top_loadings_plots_html_local <- loadings_plots_html - assign("multi_pca_top_loadings_plots_html", loadings_plots_html, envir = .GlobalEnv) - - positive_col <- "#78C679" - negative_col <- "#EE6A50" - pc1_levels <- rev(as.character(top_pc1_plot$variable)) - pc2_levels <- rev(as.character(top_pc2_plot$variable)) - - combined_loadings_html <- plotly::plot_ly() - for (panel in c("PC1", "PC2")) { - axis_suffix <- if (panel == "PC1") "" else "2" - panel_df <- if (panel == "PC1") top_pc1_plot else top_pc2_plot - panel_df <- panel_df %>% mutate(variable = as.character(variable)) - for (sgn in c("Positive", "Negative")) { - sgn_df <- panel_df %>% filter(sign == sgn) - if (nrow(sgn_df) == 0) next - combined_loadings_html <- combined_loadings_html %>% - plotly::add_trace( - data = sgn_df, - x = ~abs_loading, - y = ~variable, - type = "bar", - orientation = "h", - name = sgn, - legendgroup = sgn, - showlegend = (panel == "PC1"), - marker = list(color = if (sgn == "Positive") positive_col else negative_col), - hovertemplate = paste0("Feature: %{y}
|loading|: %{x:.3f}
Sign: ", sgn, ""), - xaxis = paste0("x", axis_suffix), - yaxis = paste0("y", axis_suffix) - ) - } - } - - combined_loadings_html <- combined_loadings_html %>% - plotly::layout( - barmode = "stack", - xaxis = list( - domain = c(0, 0.35), - title = list(text = "Absolute loading", font = list(size = 14, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 12 - ), - yaxis = list( - domain = c(0, 1), - title = list(text = "Feature", standoff = 30, automargin = TRUE, font = list(size = 14, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - categoryorder = "array", - categoryarray = pc1_levels - ), - xaxis2 = list( - domain = c(0.65, 1), - title = list(text = "Absolute loading", font = list(size = 14, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - standoff = 12, - anchor = "y2" - ), - yaxis2 = list( - domain = c(0, 1), - title = list(text = "", font = list(size = 14, family = "Arial")), - tickfont = list(size = 14), - automargin = TRUE, - categoryorder = "array", - categoryarray = pc2_levels, - anchor = "x2" - ), - legend = list( - orientation = "h", - x = 0.5, - xanchor = "center", - y = -0.25, - yanchor = "top", - font = list(size = 14, family = "Arial"), - title = list(text = "Sign") - ), - annotations = list( - list(text = "Top 10 loadings: PC1", x = 0.13, y = 1.08, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 16, family = "Arial")), - list(text = "Top 10 loadings: PC2", x = 0.9, y = 1.08, xref = "paper", yref = "paper", showarrow = FALSE, font = list(size = 16, family = "Arial")) - ), - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 14), - margin = list(t = 60, b = 140, l = 220, r = 160), - height = 640 - ) - assign("multi_pca_top_loadings_combined_html", combined_loadings_html, envir = .GlobalEnv) - - # D) Distribution plots for top-loading features on PC1/PC2 - sample_levels <- sample_levels_global - loading_plot_info <- bind_rows(top_pc1, top_pc2) %>% - distinct(variable, .keep_all = TRUE) - loading_distribution_plots <- list() - loading_distribution_plots_html <- list() - if (nrow(loading_plot_info) > 0) { - for (idx in seq_len(nrow(loading_plot_info))) { - gp_loading <- build_loading_feature_plot(multi, loading_plot_info[idx, ], sample_levels) - feat_name <- loading_plot_info$variable[idx] - if (is.null(gp_loading)) { - message(sprintf("[INFO] Skipping PCA loading feature %s due to missing or constant data.", feat_name)) - } else { - loading_distribution_plots[[feat_name]] <- gp_loading$ggplot - loading_distribution_plots_html[[feat_name]] <- gp_loading$plotly - } - } - } - multi_pca_loading_distribution_plots_local <- loading_distribution_plots - multi_pca_loading_distribution_plots_html_local <- loading_distribution_plots_html - assign("multi_pca_loading_distribution_plots", loading_distribution_plots, envir = .GlobalEnv) - assign("multi_pca_loading_distribution_plots_html", loading_distribution_plots_html, envir = .GlobalEnv) - } - } - } - - if (render_pdf) { - pdf(pdf_out, paper = "a4r", width = 14, height = 11) - grid.newpage() - title_text <- if (params$mode == "isoforms") "SQANTI-single cell\nmulti-sample isoforms report" else "SQANTI-single cell\nmulti-sample reads report" - cover <- textGrob(title_text, - gp = gpar(fontface = "italic", fontsize = 40, col = "orangered") - ) - grid.draw(cover) - - tbl_theme <- ttheme_default( - core = list(fg_params = list(cex = 1.4, hjust = 0.5, x = 0.5)), - colhead = list(fg_params = list(cex = 1.4, fontface = "bold", hjust = 0.5, x = 0.5)) - ) - tbl_grob <- tableGrob(summary_tbl, rows = NULL, theme = tbl_theme) - title_grob <- textGrob("Per cell summary of samples", gp = gpar(fontface = "italic", fontsize = 28)) - grid.newpage() - pushViewport(viewport(x = 0.5, y = 0.95)) - grid.draw(title_grob) - popViewport() - pushViewport(viewport(x = 0.5, y = 0.5)) - grid.draw(tbl_grob) - popViewport() - - if (have_cats) { - for (gp in category_plots) { - print(gp) - } - print(p_cats) - } - - if (!is.null(multi_pca_scores_plot_local)) { - print(multi_pca_scores_plot_local) - } - if (!is.null(multi_pca_scree_plot_local)) { - print(multi_pca_scree_plot_local) - } - if (!is.null(multi_pca_top_loadings_plots_local)) { - gp_load1 <- multi_pca_top_loadings_plots_local[["PC1"]] - gp_load2 <- multi_pca_top_loadings_plots_local[["PC2"]] - if (!is.null(gp_load1) && !is.null(gp_load2)) { - legend_df <- data.frame( - variable = c("pos", "neg"), - abs_loading = c(1, 1), - sign = factor(c("Positive", "Negative"), levels = c("Positive", "Negative")) - ) - legend_plot <- ggplot(legend_df, aes(x = variable, y = abs_loading, fill = sign)) + - geom_col() + - scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + - theme_void(base_size = 14) + - theme(legend.position = "bottom") - legend_grob <- gtable::gtable_filter(ggplotGrob(legend_plot), "guide-box") - row_plots <- arrangeGrob(gp_load1 + theme(legend.position = "none"), gp_load2 + theme(legend.position = "none"), ncol = 2) - grid.arrange(row_plots, legend_grob, ncol = 1, heights = c(0.86, 0.14)) - } else { - for (plt in multi_pca_top_loadings_plots_local) { - print(plt) - } - } - } - if (length(multi_pca_loading_distribution_plots_local) > 0) { - for (nm in names(multi_pca_loading_distribution_plots_local)) { - print(multi_pca_loading_distribution_plots_local[[nm]]) - } - } - - dev.off() - message(sprintf("**** Multisample report written: %s", pdf_out)) - } - - if (render_html) { - cmd_args <- commandArgs(trailingOnly = FALSE) - script_arg <- cmd_args[grep("--file=", cmd_args)] - if (length(script_arg) > 0) { - script_path <- substring(script_arg, 8L) - script_dir <- dirname(normalizePath(script_path)) - } else { - script_dir <- getwd() - } - - rmd_file <- file.path(script_dir, "SQANTI-sc_multisample_report.Rmd") - css_file <- file.path(script_dir, "style-multisample.css") - html_output_file <- file.path(out_dir, paste0(params$prefix, ".html")) - - if (!file.exists(rmd_file)) { - stop("HTML report template not found: ", rmd_file) - } - - if (file.exists(css_file)) { - file.copy(css_file, dirname(html_output_file), overwrite = TRUE) - } - - - - rmarkdown::render( - rmd_file, - output_file = html_output_file, - envir = globalenv(), - quiet = TRUE - ) - - # Cleanup: remove the copied CSS file - css_output <- file.path(dirname(html_output_file), basename(css_file)) - if (file.exists(css_output)) { - file.remove(css_output) - } - - message("HTML report generated: ", html_output_file) - } -} - -main() +#!/usr/env/bin Rscript + + # ##### SQANTI single-cell multisample report generation ##### + +### Author: Carlos Blanco + +#********************** Packages + +# !/usr/bin/env Rscript + +suppressPackageStartupMessages({ + library(dplyr) + library(tidyr) + library(ggplot2) + library(gridExtra) + library(grid) + library(ggdist) + library(stringr) + library(rmarkdown) +}) + +# Ensure RColorConesa is available; if not, install from GitHub, then load +if (!requireNamespace("RColorConesa", quietly = TRUE)) { + if (!requireNamespace("devtools", quietly = TRUE)) { + install.packages("devtools") + } + devtools::install_github("ConesaLab/RColorConesa") +} +library(RColorConesa) + +parse_args <- function() { + args <- commandArgs(trailingOnly = TRUE) + # Simple flag parser: expects --key value pairs, and a single string for --files (comma-separated) + res <- list(files = NULL, out_dir = ".", mode = "reads", report = "pdf", prefix = "SQANTI_sc_multi_report") + i <- 1 + while (i <= length(args)) { + key <- args[i] + if (startsWith(key, "--")) { + k <- substring(key, 3) + if (k %in% c("files", "out_dir", "mode", "report", "prefix")) { + if (i + 1 <= length(args)) { + res[[k]] <- args[i + 1] + i <- i + 2 + next + } else { + stop(sprintf("Missing value for flag %s", key)) + } + } else { + stop(sprintf("Unknown flag: %s", key)) + } + } else { + stop(sprintf("Unexpected argument: %s", key)) + } + } + if (is.null(res$files) || !nzchar(res$files)) { + stop("--files must be provided (comma-separated list of cell summary files)") + } + if (!(res$report %in% c("pdf", "html", "both"))) { + stop("--report must be one of: pdf, html, both") + } + res +} + +safe_read_summary <- function(fpath) { + # read.table supports gz automatically + df <- tryCatch( + { + read.table(fpath, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE) + }, + error = function(e) { + message(sprintf("[ERROR] Failed to read summary %s: %s", fpath, e$message)) + return(NULL) + } + ) + if (is.null(df)) { + return(NULL) + } + if (ncol(df) < 2 || !("CB" %in% colnames(df))) { + message(sprintf("[WARNING] Summary %s does not have expected structure; skipping", fpath)) + return(NULL) + } + # Coerce numeric columns (col 2..n) to numeric + if (ncol(df) >= 2) { + for (j in 2:ncol(df)) { + df[[j]] <- suppressWarnings(as.numeric(df[[j]])) + } + } + # Derive sampleID from filename: _SQANTI_cell_summary.txt.gz + base <- basename(fpath) + sample <- sub("_SQANTI_cell_summary\\.txt(\\.gz)?$", "", base) + df$sampleID <- sample + df +} + +# Helper: tidy feature names for titles and subtitles +format_feature_display_name <- function(feature) { + cleaned <- feature + suffixes <- c( + "_prop_in_cell$", "_perc_in_cell$", "_prop$", "_perc$", + "_percentage$", "_pct$", "_ratio$", "_count$", "_counts$", + "_in_cell$", "_per_cell$", "_value$" + ) + for (pattern in suffixes) { + cleaned <- gsub(pattern, "", cleaned, ignore.case = TRUE) + } + cleaned <- gsub("([0-9]+)b", "\\1 bp", cleaned, ignore.case = TRUE) + cleaned <- stringr::str_replace_all(cleaned, "_+", " ") + cleaned <- stringr::str_squish(cleaned) + if (cleaned == "") cleaned <- feature + tokens <- unlist(strsplit(cleaned, " ", fixed = FALSE)) + if (length(tokens) == 0) { + return(feature) + } + replacements <- c( + "Fsm" = "FSM", + "Ism" = "ISM", + "Nic" = "NIC", + "Nnc" = "NNC", + "Rts" = "RTS", + "Tss" = "TSS", + "Nmd" = "NMD", + "Cage" = "CAGE", + "Ujcs" = "UJCs", + "Ujc" = "UJC", + "Umis" = "UMIs", + "Umi" = "UMI", + "Mt" = "MT", + "Cb" = "CB", + "Pc" = "PC", + "Qc" = "QC", + "Orf" = "ORF", + "Tpm" = "TPM" + ) + normalize_token <- function(tok) { + if (tok == "") { + return(tok) + } + if (grepl("-", tok, fixed = TRUE)) { + parts <- strsplit(tok, "-", fixed = TRUE)[[1]] + parts <- vapply(parts, normalize_token, character(1), USE.NAMES = FALSE) + return(paste(parts, collapse = "-")) + } + if (tok == toupper(tok)) { + return(tok) + } + if (grepl("^[0-9]+(\\.[0-9]+)?$", tok)) { + return(tok) + } + if (grepl("^[0-9]+bp$", tok, ignore.case = TRUE)) { + return(gsub("bp$", "bp", tok, ignore.case = TRUE)) + } + key <- stringr::str_to_title(tok) + if (key %in% names(replacements)) { + return(replacements[[key]]) + } + stringr::str_to_title(tok) + } + tokens <- vapply(tokens, normalize_token, character(1), USE.NAMES = FALSE) + stringr::str_squish(paste(tokens, collapse = " ")) +} + +# Helper: derive axis labels and scaling behaviour from feature metadata +infer_feature_metadata <- function(feature, values) { + name_lower <- tolower(feature) + domain <- "Value" + if (stringr::str_detect(name_lower, "length")) domain <- "Length" + if (stringr::str_detect(name_lower, "length") && stringr::str_detect(name_lower, "prop|perc|pct|ratio|fraction")) { + if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" + } + if (stringr::str_detect(name_lower, "read")) { + if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" + } + if (stringr::str_detect(name_lower, "\\bmt\\b") || stringr::str_detect(name_lower, "^mt_")) domain <- "Reads" + if (stringr::str_detect(name_lower, "exon")) domain <- "Exons" + if (stringr::str_detect(name_lower, "intron")) domain <- "Introns" + if (stringr::str_detect(name_lower, "coverage")) domain <- "Coverage" + if (stringr::str_detect(name_lower, "isoform")) domain <- "Isoforms" + if (stringr::str_detect(name_lower, "transcript")) domain <- "Transcripts" + if (stringr::str_detect(name_lower, "gene")) domain <- "Genes" + if (stringr::str_detect(name_lower, "umi")) domain <- "UMIs" + if (stringr::str_detect(name_lower, "junction")) domain <- "Junctions" + structural_keywords <- c( + "fsm", "ism", "nic", "nnc", + "genic_genomic", "genic genomic", "genic", + "antisense", "fusion", "intergenic", + "genic_intron", "genic intron" + ) + if (any(stringr::str_detect(name_lower, structural_keywords))) { + if (exists("params") && params$mode == "isoforms") domain <- "Transcripts" else domain <- "Reads" + } + is_prop_keyword <- stringr::str_detect(name_lower, "prop|perc|pct|ratio|fraction") + finite_vals <- values[is.finite(values)] + unit <- if (is_prop_keyword) "%" else "count" + scale_to_percent <- FALSE + if (length(finite_vals) > 0) { + maxv <- max(finite_vals) + minv <- min(finite_vals) + if (unit == "%" && maxv <= 1.5) { + scale_to_percent <- TRUE + } + if (unit == "count" && !is_prop_keyword && maxv <= 1.5 && minv >= 0) { + unit <- "%" + scale_to_percent <- TRUE + } + if (domain == "Length" && unit == "count") { + unit <- "bp" + } + } else { + if (unit == "%") scale_to_percent <- TRUE + } + if (unit == "%" && domain == "Value") { + domain <- "Percentage" + } + value_label <- switch(unit, + "%" = if (domain %in% c("Percentage", "Value")) "Value, %" else sprintf("%s, %%", domain), + "bp" = sprintf("%s, bp", domain), + sprintf("%s, count", domain) + ) + if (domain == "Value" && unit == "count") { + value_label <- "Value" + } + value_label <- infer_junction_display_label(feature, value_label) + list( + display_name = format_feature_display_name(feature), + value_label = value_label, + scale_to_percent = scale_to_percent, + unit = unit, + domain = domain + ) +} + +infer_junction_display_label <- function(feature_name, current_label) { + lower_name <- tolower(feature_name) + junction_keywords <- c("junction", "junctions", "splice", "sj", "canonical", "noncanonical", "ujc", "ujcs") + contains_junction <- any(vapply(junction_keywords, function(kw) grepl(kw, lower_name, fixed = TRUE), logical(1))) + if (!contains_junction) { + return(current_label) + } + if (grepl("junct", current_label, ignore.case = TRUE)) { + return(current_label) + } + if (grepl("%", current_label, fixed = TRUE)) { + return(sub("%", " (junctions, %)", current_label, fixed = TRUE)) + } + if (grepl("count", current_label, ignore.case = TRUE) || grepl("junction", current_label, ignore.case = TRUE)) { + return(current_label) + } + paste0(current_label, ", junctions") +} + +get_conesa_palette_colors <- function(n, palette = "complete") { + if (n <= 0) { + return(character(0)) + } + + col_vec <- NULL + if (requireNamespace("RColorConesa", quietly = TRUE)) { + col_vec <- tryCatch( + { + scale_obj <- RColorConesa::scale_fill_conesa(palette = palette) + if (!is.null(scale_obj$palette) && is.function(scale_obj$palette)) { + scale_obj$palette(n) + } else { + NULL + } + }, + error = function(e) NULL + ) + + if (is.null(col_vec) || length(col_vec) == 0) { + col_vec <- tryCatch( + { + ns <- asNamespace("RColorConesa") + if (exists("conesa_palettes", envir = ns, inherits = FALSE)) { + pal_list <- get("conesa_palettes", envir = ns, inherits = FALSE) + pal_entry <- pal_list[[palette]] + if (is.function(pal_entry)) { + pal_entry(n) + } else if (is.vector(pal_entry)) { + unname(pal_entry) + } else { + NULL + } + } else { + NULL + } + }, + error = function(e) NULL + ) + } + + if ((is.null(col_vec) || length(col_vec) == 0) && exists("palette_conesa", envir = asNamespace("RColorConesa"), inherits = FALSE)) { + pal_fun <- get("palette_conesa", envir = asNamespace("RColorConesa"), inherits = FALSE) + col_vec <- tryCatch(pal_fun(palette, n), error = function(e) NULL) + } + } + + if (is.null(col_vec) || length(col_vec) == 0) { + fallback <- c( + "#6BAED6", "#FC8D59", "#78C679", "#EE6A50", "#969696", + "#66C2A4", "#FFD92F", "#E78AC3", "#A6D854", "#8DA0CB", + "#E5C494", "#B3B3B3" + ) + col_vec <- fallback + } + + rep_len(col_vec, n) +} + +# Helper: build per-feature violin + boxplot for a PCA loading +build_loading_feature_plot <- function(multi, feature_info, sample_levels) { + feature_name <- as.character(feature_info$variable)[1] + if (!feature_name %in% colnames(multi)) { + return(NULL) + } + values <- multi[[feature_name]] + if (!is.numeric(values)) { + return(NULL) + } + plot_df <- multi %>% + select(sampleID, value = all_of(feature_name)) %>% + mutate(value = as.numeric(value)) %>% + filter(is.finite(value)) + if (nrow(plot_df) == 0) { + return(NULL) + } + info <- infer_feature_metadata(feature_name, plot_df$value) + if (info$scale_to_percent) { + plot_df <- plot_df %>% mutate(value = value * 100) + } + if (length(sample_levels) == 0) { + sample_levels <- unique(plot_df$sampleID) + } + plot_df <- plot_df %>% mutate(sampleID = factor(sampleID, levels = sample_levels)) + uniqueness <- plot_df %>% + group_by(sampleID) %>% + summarise(unique_vals = n_distinct(value), .groups = "drop") + use_violin <- any(uniqueness$unique_vals > 1) + loading_value <- as.numeric(feature_info$loading)[1] + loading_rank <- as.integer(feature_info$rank)[1] + pc_label <- as.character(feature_info$PC)[1] + gp <- ggplot(plot_df, aes(x = sampleID, y = value, fill = sampleID, colour = sampleID)) + if (use_violin) { + gp <- gp + geom_violin(trim = TRUE, scale = "width", alpha = 0.7, linewidth = 0.3) + } + gp <- gp + + geom_boxplot( + width = 0.05, outlier.shape = NA, + alpha = 0.3, + colour = "grey20" + ) + + stat_summary( + fun = mean, geom = "point", shape = 4, size = 1, + colour = "red", stroke = 0.45 + ) + + scale_fill_conesa(palette = "complete", drop = FALSE) + + scale_color_conesa(palette = "complete", guide = "none", drop = FALSE) + + labs( + title = sprintf("Per Sample %s Distribution Across Cells", info$display_name), + subtitle = sprintf("%s loading rank #%d (loading = %.3f)", pc_label, loading_rank, loading_value), + x = "Sample", + y = info$value_label + ) + + theme_classic(base_size = 16) + + theme( + legend.position = "none", + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + plot.subtitle = element_text(size = 12, hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14, angle = 35, hjust = 1), + axis.text.y = element_text(size = 14) + ) + return(gp) +} + +main <- function() { + params <- parse_args() + + files <- unlist(strsplit(params$files, ",", fixed = TRUE)) + files <- trimws(files) + files <- files[nchar(files) > 0] + if (length(files) < 2) { + message("[INFO] Fewer than 2 files provided. Nothing to do.") + quit(status = 0) + } + + # Read all summaries + lst <- lapply(files, safe_read_summary) + lst <- Filter(Negate(is.null), lst) + if (length(lst) < 2) { + message("[INFO] Fewer than 2 valid summaries after reading. Skipping.") + quit(status = 0) + } + + # Harmonize columns: union of all names; fill missing with 0 for numeric, "" otherwise + all_cols <- Reduce(union, lapply(lst, colnames)) + # Ensure CB and sampleID exist in final order front + all_cols <- unique(c("CB", setdiff(all_cols, "CB"))) + all_cols <- unique(c(all_cols, "sampleID")) + + norm_list <- lapply(lst, function(df) { + missing <- setdiff(all_cols, colnames(df)) + for (m in missing) { + df[[m]] <- if (m == "CB" || m == "sampleID") "" else 0 + } + # Reorder + df <- df[, all_cols] + df + }) + multi <- bind_rows(norm_list) + + sample_levels_global <- unique(multi$sampleID[!is.na(multi$sampleID)]) + + render_pdf <- params$report %in% c("pdf", "both") + render_html <- params$report %in% c("html", "both") + + # Basic cohort-level aggregates per sample + count_col <- if (params$mode == "isoforms") "Transcripts_in_cell" else "Reads_in_cell" + + per_sample_stats <- multi %>% + group_by(sampleID) %>% + summarise( + cells = n_distinct(CB), + mean_reads = mean(.data[[count_col]], na.rm = TRUE), + mean_umis = if ("UMIs_in_cell" %in% names(.)) mean(UMIs_in_cell, na.rm = TRUE) else NA, + median_reads = median(.data[[count_col]], na.rm = TRUE), + mean_genes = mean(Genes_in_cell, na.rm = TRUE), + median_genes = median(Genes_in_cell, na.rm = TRUE), + mean_annotated = mean(Annotated_genes, na.rm = TRUE), + mean_novel = mean(Novel_genes, na.rm = TRUE), + mean_ujc = if ("UJCs_in_cell" %in% names(.)) mean(UJCs_in_cell, na.rm = TRUE) else NA, + median_ujc = if ("UJCs_in_cell" %in% names(.)) median(UJCs_in_cell, na.rm = TRUE) else NA, + mean_mt = mean(MT_perc, na.rm = TRUE) + ) + + entity_label_plural <- if (params$mode == "isoforms") "Transcripts" else "Reads" + + summary_tbl <- per_sample_stats %>% + mutate(across(where(is.numeric), ~ round(., 3))) %>% + transmute( + Sample = sampleID, + `Cell\nBarcodes` = cells, + `Average\nReads` = mean_reads, + `Average\nUMIs` = mean_umis, + `Average\nAnnotated\nGenes` = mean_annotated, + `Average\nNovel\nGenes` = mean_novel, + `Average\nUJCs` = mean_ujc, + `Average\nMitochondrial\nReads` = mean_mt + ) + + # Rename columns dynamically + colnames(summary_tbl)[colnames(summary_tbl) == "Average\nReads"] <- paste0("Average\n", entity_label_plural) + colnames(summary_tbl)[colnames(summary_tbl) == "Average\nMitochondrial\nReads"] <- paste0("Average\nMitochondrial\n", entity_label_plural) + + if (params$mode == "isoforms") { + summary_tbl <- summary_tbl %>% select(-`Average\nUMIs`, -`Average\nUJCs`) + } + + summary_tbl_html <- summary_tbl + colnames(summary_tbl_html) <- gsub("\\n", "
", colnames(summary_tbl_html), fixed = TRUE) + + assign("multi_per_sample_stats", per_sample_stats, envir = .GlobalEnv) + assign("multi_summary_tbl_pdf", summary_tbl, envir = .GlobalEnv) + assign("multi_summary_tbl_html", summary_tbl_html, envir = .GlobalEnv) + assign("entity_label", if (params$mode == "isoforms") "Transcript" else "Read", envir = .GlobalEnv) + assign("entity_label_plural", entity_label_plural, envir = .GlobalEnv) + assign("mode", params$mode, envir = .GlobalEnv) + + # Output path + out_dir <- params$out_dir + if (!dir.exists(out_dir)) { + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + } + pdf_out <- file.path(out_dir, paste0(params$prefix, ".pdf")) + + if (render_html) { + message("[INFO] HTML report requested; plots will be rendered via R Markdown template.") + } + + # Structural category proportions, if available + cat_cols <- c("FSM_prop", "ISM_prop", "NIC_prop", "NNC_prop", "Genic_Genomic_prop", "Antisense_prop", "Fusion_prop", "Intergenic_prop", "Genic_intron_prop") + have_cats <- all(cat_cols %in% colnames(multi)) + if (have_cats) { + cats_long <- multi %>% + select(all_of(c("sampleID", cat_cols))) %>% + pivot_longer(cols = all_of(cat_cols), names_to = "category", values_to = "prop") %>% + mutate( + sampleID = factor(sampleID, levels = unique(multi$sampleID)), + category = factor(category, + levels = c( + "FSM_prop", "ISM_prop", "NIC_prop", "NNC_prop", + "Genic_Genomic_prop", "Antisense_prop", "Fusion_prop", "Intergenic_prop", "Genic_intron_prop" + ), + labels = c( + "FSM", "ISM", "NIC", "NNC", + "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic intron" + ) + ) + ) %>% + mutate(prop = suppressWarnings(as.numeric(prop))) + + p_cats <- ggplot(cats_long, aes(x = category, y = prop, fill = sampleID, colour = sampleID, group = sampleID)) + + ggdist::stat_slabinterval( + side = "left", + position = position_dodge(width = 0.6), + density = "unbounded", + bw = "nrd0", + normalize = "groups", scale = 0.85, adjust = 2.5, trim = TRUE, + show_point = FALSE, show_interval = FALSE, # slab only (fill only) + slab_colour = NA, alpha = 0.7 + ) + + ggdist::stat_slabinterval( + side = "left", + position = position_dodge(width = 0.6), + density = "unbounded", + bw = "nrd0", + normalize = "groups", scale = 0.85, adjust = 2.5, trim = TRUE, + show_point = FALSE, show_interval = FALSE, # outline-only + mapping = aes(slab_colour = after_scale(colour)), + fill = NA, slab_linewidth = 0.05, show.legend = FALSE + ) + + # Draw medians as short horizontal lines centered within each dodged slab + stat_summary( + data = cats_long, aes(group = sampleID), + fun = median, fun.min = median, fun.max = median, + geom = "crossbar", width = 0.1, + position = position_dodge(width = 0.6), + color = "black", linewidth = 0.1, alpha = 1 + ) + + # Add a small lower expansion so slabs don't touch the x-axis + scale_y_continuous(limits = c(0, 100), expand = expansion(add = c(1, 0))) + + scale_fill_conesa(palette = "complete") + + scale_color_conesa(palette = "complete", guide = "none") + + guides( + fill = guide_legend(override.aes = list(shape = 15, size = 5, alpha = 0.95, colour = NA, stroke = 0)), + linetype = "none", alpha = "none", size = "none", colour = "none" + ) + + theme_classic(base_size = 12) + + labs( + title = "Structural Category Proportions by Sample", + x = "Structural category", y = "Reads, %" + ) + + theme( + legend.position = "bottom", + axis.text.x = element_text(angle = 45, hjust = 1, size = 14), + axis.title = element_text(size = 16), + axis.text.y = element_text(size = 14), + plot.title = element_text(size = 20, face = "bold", hjust = 0.5) + ) + + category_levels <- levels(cats_long$category) + sample_levels_all <- levels(cats_long$sampleID) + cat_to_col <- c( + "FSM" = "#6BAED6", + "ISM" = "#FC8D59", + "NIC" = "#78C679", + "NNC" = "#EE6A50", + "Genic Genomic" = "#969696", + "Antisense" = "#66C2A4", + "Fusion" = "goldenrod1", + "Intergenic" = "darksalmon", + "Genic intron" = "#41B6C4" + ) + + category_plots <- lapply(category_levels, function(cat_lab) { + dfp <- cats_long %>% filter(category == cat_lab) + cat_col <- unname(cat_to_col[[as.character(cat_lab)]]) + if (is.null(cat_col) || is.na(cat_col)) cat_col <- "grey60" + box_outline_col <- if (as.character(cat_lab) == "Genic Genomic") "grey90" else "grey20" + violin_fill <- grDevices::adjustcolor(cat_col, alpha.f = 0.7) + ggplot(dfp, aes(x = sampleID, y = prop)) + + geom_violin(fill = violin_fill, color = cat_col, linewidth = 0.3, width = 0.8, trim = TRUE) + + geom_boxplot(width = 0.05, outlier.shape = NA, fill = cat_col, color = box_outline_col, alpha = 0.3) + + stat_summary(fun = mean, geom = "point", shape = 4, size = 1, colour = "red", stroke = 0.9) + + scale_y_continuous(limits = c(0, 100), expand = expansion(add = c(1, 0))) + + theme_classic(base_size = 14) + + labs(title = paste0("Per Sample ", cat_lab, " Reads Distribution Across Cells"), x = "Sample", y = "Reads, %") + + theme( + legend.position = "none", + axis.text.x = element_text(angle = 0, hjust = 0.5, size = 14), + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.y = element_text(size = 14) + ) + }) + names(category_plots) <- as.character(category_levels) + + assign("multi_structural_category_combined_plot", p_cats, envir = .GlobalEnv) + assign("multi_structural_category_violin_plots", category_plots, envir = .GlobalEnv) + + } + + multi_pca_scores_plot_local <- NULL + + multi_pca_scree_plot_local <- NULL + + multi_pca_top_loadings_plots_local <- NULL + + multi_pca_loading_distribution_plots_local <- list() + + # -------- PCA (all numeric features, per-sample medians) -------- + # 1) Select all numeric columns from the cell summary + num_cols <- names(multi)[sapply(multi, function(x) is.numeric(x) && !all(is.na(x)))] + # 2) Aggregate per-sample medians across all numeric features + agg_median <- multi %>% + group_by(sampleID) %>% + summarise(across(all_of(num_cols), ~ median(., na.rm = TRUE)), .groups = "drop") + + if (nrow(agg_median) >= 2 && ncol(agg_median) >= 2) { + # 3) Drop features with zero variance across samples + feat_sds <- sapply(agg_median %>% select(-sampleID), function(x) stats::sd(x, na.rm = TRUE)) + feat_keep <- names(feat_sds)[is.finite(feat_sds) & !is.na(feat_sds) & feat_sds > 0] + + if (length(feat_keep) >= 2) { + mat <- as.matrix(agg_median[, feat_keep, drop = FALSE]) + rownames(mat) <- agg_median$sampleID + pca_fit <- stats::prcomp(mat, center = TRUE, scale. = TRUE) + var_expl <- (pca_fit$sdev^2) / sum(pca_fit$sdev^2) + + # A) PC1–PC2 scatter (first among PCA plots) + if (ncol(pca_fit$x) >= 2) { + scores <- as.data.frame(pca_fit$x) + scores$sampleID <- rownames(scores) + gp_scores <- ggplot(scores, aes(x = PC1, y = PC2, colour = sampleID, label = sampleID)) + + geom_point(size = 3.8, alpha = 0.95, shape = 19, stroke = 0) + + scale_color_conesa(palette = "complete") + + theme_classic(base_size = 16) + + labs( + title = "PCA Plot Based on sampleID", + x = sprintf("PC1 (%.1f%%)", 100 * var_expl[1]), + y = sprintf("PC2 (%.1f%%)", 100 * var_expl[2]) + ) + + scale_x_continuous(labels = function(x) sprintf("%.2f", x)) + + scale_y_continuous(labels = function(x) sprintf("%.2f", x)) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.text = element_text(size = 12), + legend.key = element_blank(), + legend.margin = margin(t = 16), + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14) + ) + + guides(colour = guide_legend(override.aes = list(size = 5, alpha = 0.95, stroke = 0))) + multi_pca_scores_plot_local <- gp_scores + assign("multi_pca_scores_plot", gp_scores, envir = .GlobalEnv) + + } + + # B) Scree plot (second) + k <- min(length(var_expl), 10) + scree_df <- data.frame( + PC = factor(paste0("PC", seq_len(k)), levels = paste0("PC", seq_len(k))), + Proportion = var_expl[seq_len(k)], + Cumulative = cumsum(var_expl)[seq_len(k)] + ) + gp_scree <- ggplot(scree_df, aes(x = PC)) + + geom_col(aes(y = Proportion, fill = "Proportion"), width = 0.8, colour = NA) + + geom_point(aes(y = Cumulative, colour = "Cumulative"), size = 2.2) + + geom_line(aes(y = Cumulative, colour = "Cumulative", group = 1), linewidth = 0.6) + + scale_fill_manual(values = c("Proportion" = "#6BAED6"), name = "") + + scale_color_manual(values = c("Cumulative" = "#4D4D4D"), name = "") + + theme_classic(base_size = 16) + + labs(title = "PCA scree plot", y = "Variance explained", x = "Principal component") + + theme( + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.position = "bottom", + legend.margin = margin(t = 20) + ) + multi_pca_scree_plot_local <- gp_scree + assign("multi_pca_scree_plot", gp_scree, envir = .GlobalEnv) + + # C) Top loadings for PC1 and PC2 (third) + if (ncol(pca_fit$rotation) >= 2) { + rot <- as.data.frame(pca_fit$rotation) + rot$variable <- rownames(rot) + top_n <- 10L + pick_top <- function(colname) { + ord <- order(abs(rot[[colname]]), decreasing = TRUE) + head(rot[ord, c("variable", colname)], top_n) + } + top_pc1 <- pick_top("PC1") + colnames(top_pc1) <- c("variable", "loading") + top_pc1 <- top_pc1 %>% + mutate( + variable = as.character(variable), + PC = "PC1", + rank = dplyr::row_number(), + sign = if_else(loading >= 0, "Positive", "Negative"), + abs_loading = abs(loading) + ) + top_pc2 <- pick_top("PC2") + colnames(top_pc2) <- c("variable", "loading") + top_pc2 <- top_pc2 %>% + mutate( + variable = as.character(variable), + PC = "PC2", + rank = dplyr::row_number(), + sign = if_else(loading >= 0, "Positive", "Negative"), + abs_loading = abs(loading) + ) + + top_pc1_plot <- top_pc1 + top_pc2_plot <- top_pc2 + top_pc1_plot$variable <- factor(top_pc1_plot$variable, levels = rev(top_pc1_plot$variable)) + top_pc2_plot$variable <- factor(top_pc2_plot$variable, levels = rev(top_pc2_plot$variable)) + top_pc1_plot$sign <- factor(top_pc1_plot$sign, levels = c("Positive", "Negative")) + top_pc2_plot$sign <- factor(top_pc2_plot$sign, levels = c("Positive", "Negative")) + + gp_load1 <- ggplot(top_pc1_plot, aes(x = variable, y = abs_loading, fill = sign)) + + geom_col(width = 0.7) + + coord_flip() + + scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + + theme_classic(base_size = 16) + + labs(title = "Top 10 loadings: PC1", x = "Feature", y = "Absolute loading") + + theme( + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.position = "bottom" + ) + gp_load2 <- ggplot(top_pc2_plot, aes(x = variable, y = abs_loading, fill = sign)) + + geom_col(width = 0.7) + + coord_flip() + + scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + + theme_classic(base_size = 16) + + labs(title = "Top 10 loadings: PC2", x = "Feature", y = "Absolute loading") + + theme( + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.position = "bottom" + ) + loadings_plots <- list(PC1 = gp_load1, PC2 = gp_load2) + multi_pca_top_loadings_plots_local <- loadings_plots + assign("multi_pca_top_loadings_plots", loadings_plots, envir = .GlobalEnv) + + # D) Distribution plots for top-loading features on PC1/PC2 + sample_levels <- sample_levels_global + loading_plot_info <- bind_rows(top_pc1, top_pc2) %>% + distinct(variable, .keep_all = TRUE) + loading_distribution_plots <- list() + loading_distribution_plots_html <- list() + if (nrow(loading_plot_info) > 0) { + for (idx in seq_len(nrow(loading_plot_info))) { + gp_loading <- build_loading_feature_plot(multi, loading_plot_info[idx, ], sample_levels) + feat_name <- loading_plot_info$variable[idx] + if (is.null(gp_loading)) { + message(sprintf("[INFO] Skipping PCA loading feature %s due to missing or constant data.", feat_name)) + } else { + loading_distribution_plots[[feat_name]] <- gp_loading + + } + } + } + multi_pca_loading_distribution_plots_local <- loading_distribution_plots + + assign("multi_pca_loading_distribution_plots", loading_distribution_plots, envir = .GlobalEnv) + + } + } + } + + if (render_pdf) { + pdf(pdf_out, paper = "a4r", width = 14, height = 11) + grid.newpage() + title_text <- if (params$mode == "isoforms") "SQANTI-single cell\nmulti-sample isoforms report" else "SQANTI-single cell\nmulti-sample reads report" + cover <- textGrob(title_text, + gp = gpar(fontface = "italic", fontsize = 40, col = "orangered") + ) + grid.draw(cover) + + tbl_theme <- ttheme_default( + core = list(fg_params = list(cex = 1.4, hjust = 0.5, x = 0.5)), + colhead = list(fg_params = list(cex = 1.4, fontface = "bold", hjust = 0.5, x = 0.5)) + ) + tbl_grob <- tableGrob(summary_tbl, rows = NULL, theme = tbl_theme) + title_grob <- textGrob("Per cell summary of samples", gp = gpar(fontface = "italic", fontsize = 28)) + grid.newpage() + pushViewport(viewport(x = 0.5, y = 0.95)) + grid.draw(title_grob) + popViewport() + pushViewport(viewport(x = 0.5, y = 0.5)) + grid.draw(tbl_grob) + popViewport() + + if (have_cats) { + for (gp in category_plots) { + print(gp) + } + print(p_cats) + } + + if (!is.null(multi_pca_scores_plot_local)) { + print(multi_pca_scores_plot_local) + } + if (!is.null(multi_pca_scree_plot_local)) { + print(multi_pca_scree_plot_local) + } + if (!is.null(multi_pca_top_loadings_plots_local)) { + gp_load1 <- multi_pca_top_loadings_plots_local[["PC1"]] + gp_load2 <- multi_pca_top_loadings_plots_local[["PC2"]] + if (!is.null(gp_load1) && !is.null(gp_load2)) { + legend_df <- data.frame( + variable = c("pos", "neg"), + abs_loading = c(1, 1), + sign = factor(c("Positive", "Negative"), levels = c("Positive", "Negative")) + ) + legend_plot <- ggplot(legend_df, aes(x = variable, y = abs_loading, fill = sign)) + + geom_col() + + scale_fill_manual(values = c("Positive" = "#78C679", "Negative" = "#EE6A50"), name = "Sign", limits = c("Positive", "Negative"), drop = FALSE) + + theme_void(base_size = 14) + + theme(legend.position = "bottom") + legend_grob <- gtable::gtable_filter(ggplotGrob(legend_plot), "guide-box") + row_plots <- arrangeGrob(gp_load1 + theme(legend.position = "none"), gp_load2 + theme(legend.position = "none"), ncol = 2) + grid.arrange(row_plots, legend_grob, ncol = 1, heights = c(0.86, 0.14)) + } else { + for (plt in multi_pca_top_loadings_plots_local) { + print(plt) + } + } + } + if (length(multi_pca_loading_distribution_plots_local) > 0) { + for (nm in names(multi_pca_loading_distribution_plots_local)) { + print(multi_pca_loading_distribution_plots_local[[nm]]) + } + } + + dev.off() + message(sprintf("**** Multisample report written: %s", pdf_out)) + } + + if (render_html) { + cmd_args <- commandArgs(trailingOnly = FALSE) + script_arg <- cmd_args[grep("--file=", cmd_args)] + if (length(script_arg) > 0) { + script_path <- substring(script_arg, 8L) + script_dir <- dirname(normalizePath(script_path)) + } else { + script_dir <- getwd() + } + + rmd_file <- file.path(script_dir, "SQANTI-sc_multisample_report.Rmd") + css_file <- file.path(script_dir, "style-multisample.css") + html_output_file <- file.path(out_dir, paste0(params$prefix, ".html")) + + if (!file.exists(rmd_file)) { + stop("HTML report template not found: ", rmd_file) + } + + if (file.exists(css_file)) { + file.copy(css_file, dirname(html_output_file), overwrite = TRUE) + } + + rmarkdown::render( + rmd_file, + output_file = html_output_file, + envir = globalenv(), + quiet = TRUE + ) + + # Cleanup: remove the copied CSS file + css_output <- file.path(dirname(html_output_file), basename(css_file)) + if (file.exists(css_output)) { + file.remove(css_output) + } + + message("HTML report generated: ", html_output_file) + } +} + +main() diff --git a/src/report_assets/SQANTI-sc_multisample_report.Rmd b/src/report_assets/SQANTI-sc_multisample_report.Rmd index 135ba03..47bec28 100644 --- a/src/report_assets/SQANTI-sc_multisample_report.Rmd +++ b/src/report_assets/SQANTI-sc_multisample_report.Rmd @@ -17,71 +17,10 @@ output: ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -library(plotly) library(dplyr) library(htmltools) - -prepare_plot <- function(p, width = 1000, height = 600, responsive = FALSE, - title_pad = NULL, margin = NULL) { - if (is.null(p) || !inherits(p, "htmlwidget")) { - return(p) - } - - `%||%` <- function(a, b) if (is.null(a)) b else a - - standard_margin <- list(t = 80, b = 120, l = 100, r = 80) - legend_defaults <- list( - orientation = "h", - x = 0.5, - xanchor = "center", - y = -0.25, - yanchor = "top", - bgcolor = "rgba(0,0,0,0)", - bordercolor = "rgba(0,0,0,0)" - ) - title_pad_defaults <- list(l = 0, r = 0, t = 12, b = 12) - axis_tickfont_defaults <- list(size = 14) - axis_title_defaults <- list(standoff = 12) - axis_title_font_defaults <- list(size = 16) - - final_margin <- standard_margin - if (!is.null(margin)) { - final_margin <- utils::modifyList(final_margin, margin) - } - p <- plotly::layout(p, width = width, height = height) - p$x$layout$margin <- final_margin - - title_entry <- p$x$layout$title %||% list() - title_entry$pad <- utils::modifyList(title_pad_defaults, title_entry$pad %||% list()) - if (!is.null(title_pad)) { - title_entry$pad <- utils::modifyList(title_entry$pad, title_pad) - } - p$x$layout$title <- title_entry - - existing_legend <- p$x$layout$legend %||% list() - legend_entry <- utils::modifyList(existing_legend, legend_defaults) - if (is.null(existing_legend$title) || is.null(existing_legend$title$text)) { - legend_entry$title <- utils::modifyList(legend_entry$title %||% list(), list(text = "")) - } - legend_entry$font <- utils::modifyList(legend_entry$font %||% list(), list(size = 14)) - p$x$layout$legend <- legend_entry - - axis_names <- names(p$x$layout) - axis_names <- axis_names[grepl("^[xy]axis(\\d+)?$", axis_names)] - for (axis_name in axis_names) { - axis_entry <- p$x$layout[[axis_name]] %||% list() - axis_entry$automargin <- TRUE - axis_entry$tickfont <- utils::modifyList(axis_entry$tickfont %||% list(), axis_tickfont_defaults) - axis_entry$title <- utils::modifyList(axis_entry$title %||% list(), axis_title_defaults) - axis_entry$title$font <- utils::modifyList(axis_entry$title$font %||% list(), axis_title_font_defaults) - p$x$layout[[axis_name]] <- axis_entry - } - - plotly::config(p, displaylogo = FALSE, responsive = responsive) -} ``` - # Samples Per-Cell Summary ```{r summary-table} @@ -100,7 +39,7 @@ if (exists("multi_summary_tbl_html", envir = .GlobalEnv)) { ```{r structural-per-category, results='asis'} if (exists("multi_structural_category_violin_plots", envir = .GlobalEnv)) { plot_list <- get("multi_structural_category_violin_plots", envir = .GlobalEnv) - + if (length(plot_list) > 0) { # Define category order category_order <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") @@ -108,12 +47,12 @@ if (exists("multi_structural_category_violin_plots", envir = .GlobalEnv)) { matched <- match(category_order, names(plot_list)) # Filter and reorder, keep only those that exist plot_list <- plot_list[na.omit(matched)] - + # Create tabs for each category for (name in names(plot_list)) { - cat('\n### ', name, '\n') + cat("\n### ", name, "\n") print(plot_list[[name]]) - cat('\n') + cat("\n") } } else { cat("No per-category structural plots available.") @@ -134,21 +73,8 @@ if (exists("multi_structural_category_violin_plots", envir = .GlobalEnv)) {
```{r pca-scores} -if (exists("multi_pca_scores_plot_html", envir = .GlobalEnv)) { - prepare_plot( - get("multi_pca_scores_plot_html", envir = .GlobalEnv), - width = 800, height = 660, - margin = list(t = 100), - title_pad = list(b = 16) - ) -} else if (exists("multi_pca_scores_plot", envir = .GlobalEnv)) { - plt <- get("multi_pca_scores_plot", envir = .GlobalEnv) - prepare_plot( - plotly::ggplotly(plt, tooltip = c("x", "y", "colour", "label")), - width = 800, height = 660, - margin = list(t = 100), - title_pad = list(b = 16) - ) +if (exists("multi_pca_scores_plot", envir = .GlobalEnv)) { + print(get("multi_pca_scores_plot", envir = .GlobalEnv)) } else { cat("PCA score plot is not available.") } @@ -161,19 +87,8 @@ if (exists("multi_pca_scores_plot_html", envir = .GlobalEnv)) {
```{r pca-scree} -if (exists("multi_pca_scree_plot_html", envir = .GlobalEnv)) { - prepare_plot( - get("multi_pca_scree_plot_html", envir = .GlobalEnv), - margin = list(t = 100), - title_pad = list(b = 6) - ) -} else if (exists("multi_pca_scree_plot", envir = .GlobalEnv)) { - plt <- get("multi_pca_scree_plot", envir = .GlobalEnv) - prepare_plot( - plotly::ggplotly(plt, tooltip = c("x", "y")), - margin = list(t = 100), - title_pad = list(b = 6) - ) +if (exists("multi_pca_scree_plot", envir = .GlobalEnv)) { + print(get("multi_pca_scree_plot", envir = .GlobalEnv)) } else { cat("PCA scree plot is not available.") } @@ -190,9 +105,9 @@ if (exists("multi_pca_scree_plot_html", envir = .GlobalEnv)) { if (exists("multi_pca_top_loadings_plots", envir = .GlobalEnv)) { loadings_plots <- get("multi_pca_top_loadings_plots", envir = .GlobalEnv) for (name in names(loadings_plots)) { - cat('\n### ', name, ' Loadings\n') + cat("\n### ", name, " Loadings\n") print(loadings_plots[[name]]) - cat('\n') + cat("\n") } } @@ -201,9 +116,9 @@ if (exists("multi_pca_loading_distribution_plots", envir = .GlobalEnv)) { feature_plots <- get("multi_pca_loading_distribution_plots", envir = .GlobalEnv) if (length(feature_plots) > 0) { for (name in names(feature_plots)) { - cat('\n### ', name, '\n') + cat("\n### ", name, "\n") print(feature_plots[[name]]) - cat('\n') + cat("\n") } } } diff --git a/src/report_assets/SQANTI-sc_report.R b/src/report_assets/SQANTI-sc_report.R index 1ce17b0..6fff527 100644 --- a/src/report_assets/SQANTI-sc_report.R +++ b/src/report_assets/SQANTI-sc_report.R @@ -1,4257 +1,4014 @@ -#!/usr/env/bin Rscript - -###################################################### -##### SQANTI single-cell reads report generation ##### -###################################################### - - - -### Author: Juan Francisco Cervilla & Carlos Blanco - -#********************** Packages - -suppressWarnings(suppressPackageStartupMessages({ - library(dplyr) - library(ggplot2) - library(tidyr) - library(forcats) - library(grid) - library(gridExtra) - library(rmarkdown) - library(plotly) - library(scales) - library(data.table) -})) - -#********************** Taking arguments from python script - -args <- commandArgs(trailingOnly = TRUE) -class.file <- args[1] -junc.file <- args[2] -report.format <- args[3] -outputPathPrefix <- args[4] -mode <- args[5] - -# Initialize ignore_cell_summary flag -ignore_cell_summary <- FALSE -skipORF <- FALSE -CAGE_peak <- FALSE -polyA_motif_list <- FALSE -cell_summary_path <- NULL - -# Check for optional arguments -if (length(args) > 5) { - i <- 6 - while (i <= length(args)) { - arg <- args[i] - if (arg == "--ignore_cell_summary") { - ignore_cell_summary <- TRUE - i <- i + 1 - next - } - if (arg == "--skipORF") { - skipORF <- TRUE - i <- i + 1 - next - } - if (arg == "--CAGE_peak") { - CAGE_peak <- TRUE - i <- i + 1 - next - } - if (arg == "--polyA_motif_list") { - polyA_motif_list <- TRUE - i <- i + 1 - next - } - if (arg == "--cell_summary") { - if ((i + 1) <= length(args)) { - cell_summary_path <- args[i + 1] - i <- i + 2 - next - } else { - stop("--cell_summary requires a path argument") - } - } - if (arg == "--clustering") { - if ((i + 1) <= length(args)) { - clustering_path <- args[i + 1] - i <- i + 2 - next - } else { - stop("--clustering requires a path argument") - } - } - i <- i + 1 - } -} - -# Validate arguments -if (length(args) < 5) { - stop("Incorrect number of arguments! Required: [classification file] [junc file] [report format] [outputPathPrefix] [mode]. Abort!") -} - -if (!(report.format %in% c("pdf", "html", "both"))) { - stop("Report format needs to be: pdf, html, or both. Abort!") -} - -# Validate mode argument -if (!(mode %in% c("reads", "isoforms"))) { - stop("Mode needs to be: reads or isoforms. Abort!") -} - -# Set labels based on mode -if (mode == "isoforms") { - entity_label <- "Transcript" - entity_label_plural <- "Transcripts" -} else { - entity_label <- "Read" - entity_label_plural <- "Reads" -} - -# Lowercase versions for inline text -entity_label_lower <- tolower(entity_label) -entity_label_plural_lower <- tolower(entity_label_plural) - -# Print cell summary saving status -if (ignore_cell_summary) { - print("Cell summary table will not be saved (--ignore_cell_summary flag is active).") -} else { - print("Cell summary table will be saved.") -} - -# Call the function with the appropriate Save parameter -save_option <- ifelse(ignore_cell_summary, "N", "Y") - -# Define column names based on mode -if (mode == "isoforms") { - count_col <- "Transcripts_in_cell" - no_mono_col <- "total_transcripts_no_monoexon" -} else { - count_col <- "Reads_in_cell" - no_mono_col <- "total_reads_no_monoexon" -} - -# Generate output file names with full paths -cell_summary_output <- file.path(paste0(outputPathPrefix, "_SQANTI_cell_summary")) -report_output <- file.path(paste0(outputPathPrefix, "_SQANTI_sc_report_", mode)) -clustering_output <- file.path(dirname(outputPathPrefix), "clustering", "umap_results.csv") - -# Define standard colors -fill_color_orange <- "#CC6633" - -# Check for clustering results -gg_umap <- NULL -if (file.exists(clustering_output)) { - print(paste("Found clustering results at:", clustering_output)) - tryCatch( - { - umap_df <- read.csv(clustering_output) - umap_df$Cluster <- as.factor(umap_df$Cluster) - - gg_umap <- ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) + - geom_point(alpha = 0.6, size = 0.5) + - theme_classic() + - labs(title = "UMAP Projection", x = "UMAP 1", y = "UMAP 2") + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.title = element_text(face = "bold"), - legend.position = "right" - ) + - guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) - }, - error = function(e) { - print(paste("Error reading clustering results:", e$message)) - } - ) -} else { - print("No clustering results found.") -} - - -# ---------------------------------------------------------------- -# Helper Functions (Global Scope) -# ---------------------------------------------------------------- - -# Helper: convert any R color (hex or named) to an rgba() string with alpha without affecting line color -to_rgba <- function(col, alpha = 1.0) { - rgb <- grDevices::col2rgb(col) - sprintf("rgba(%d,%d,%d,%.3f)", rgb[1], rgb[2], rgb[3], alpha) -} - -# Helper: pivot selected columns to long and return factor-ordered long df -pivot_long <- function(df, cols) { - out <- pivot_longer(df, cols = all_of(cols), names_to = "Variable", values_to = "Value") %>% - select(Variable, Value) - out$Variable <- factor(out$Variable, levels = cols) - out -} - -# Helper: generic violin + box + mean-cross plot with shared theme (ggplot version for PDF) -build_violin_plot_ggplot <- function(df_long, - title, - x_labels, - fill_map, - color_map = fill_map, - x_title = "", - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = NULL, - override_outline_vars = character(0), - violin_alpha = 0.7, - box_alpha = 0.6, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = FALSE, - box_outline_default = "grey20", - bandwidth = NULL, - adjust = 1) { - # Determine a robust bandwidth for KDE; floor to avoid bw=0 on constant data - vals <- df_long$Value - vals <- vals[is.finite(vals)] - bw_eff <- bandwidth - if (is.null(bw_eff) || !is.numeric(bw_eff) || is.na(bw_eff) || bw_eff <= 0) { - if (length(vals) >= 2) { - bw_eff <- stats::bw.nrd0(vals) * adjust - } else { - bw_eff <- NA_real_ - } - } - if (is.na(bw_eff) || bw_eff <= 0) bw_eff <- 0.1 - - # Create ggplot version for PDF output - p <- ggplot(df_long, aes(x = Variable, y = Value)) + - # Violin layer with outline rule - { - if (isTRUE(violin_outline_fill)) { - geom_violin(aes(fill = Variable, color = Variable), alpha = violin_alpha, scale = "width", show.legend = legend, bw = bw_eff, trim = TRUE) - } else { - geom_violin(aes(fill = Variable), color = "black", alpha = violin_alpha, scale = "width", show.legend = legend, bw = bw_eff, trim = TRUE) - } - } + - scale_fill_manual(values = fill_map, labels = x_labels) + - { - if (isTRUE(violin_outline_fill)) scale_color_manual(values = fill_map, guide = "none") else NULL - } + - scale_x_discrete(labels = x_labels) + - labs(title = title, x = x_title, y = y_label) + - labs(title = title, x = x_title, y = y_label) + - theme_classic(base_size = 11) + - theme( - plot.title = element_text(size = 12, face = "bold", hjust = 0.5), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 11), - axis.text.x = element_text(size = 11, angle = x_tickangle, hjust = ifelse(x_tickangle == 0, 0.5, 1)), - legend.position = if (legend) "bottom" else "none" - ) - - # Add boxplots per variable with correct outline color (grey90 overrides) - for (var in levels(df_long$Variable)) { - var_df <- df_long[df_long$Variable == var, , drop = FALSE] - box_col <- if (var %in% override_outline_vars) "grey90" else box_outline_default - p <- p + geom_boxplot( - data = var_df, - aes(x = Variable, y = Value, fill = Variable), - width = box_width, outlier.shape = NA, alpha = box_alpha, show.legend = FALSE, color = box_col, lwd = 0.3 - ) - } - - # Add mean markers on top (moved here to separate from violin layer and ensure it is on top of boxplots) - p <- p + stat_summary(fun = mean, geom = "point", shape = 4, size = 1, color = "red", stroke = 1, show.legend = FALSE) - - if (!is.null(ylim)) { - p <- p + coord_cartesian(ylim = ylim) - } - - return(p) -} - -# Helper: generic violin + box + mean-cross plot with shared theme -build_violin_plot <- function(df_long, - title, - x_labels, - fill_map, - color_map = fill_map, - x_title = "", - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = NULL, - override_outline_vars = character(0), - violin_alpha = 0.7, - box_alpha = 0.6, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = FALSE, - box_outline_default = "grey20", - adjust = 1, - format = "plotly") { - # Store data globally for PDF generation - plot_data_key <- paste0("plot_data_", gsub("[^A-Za-z0-9]", "_", title)) - assign(plot_data_key, list( - df_long = df_long, - title = title, - x_labels = x_labels, - fill_map = fill_map, - color_map = color_map, - x_title = x_title, - y_label = y_label, - legend = legend, - ylim = ylim, - override_outline_vars = override_outline_vars, - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = box_width, - x_tickangle = x_tickangle, - violin_outline_fill = violin_outline_fill, - box_outline_default = box_outline_default - ), envir = .GlobalEnv) - - # If this is a percentage plot, clamp values to [0,100] so violins don't extend under/over bounds - df_plot <- df_long - if (grepl("%", y_label)) { - df_plot$Value <- pmin(pmax(df_plot$Value, 0), 100) - } else if (grepl("count", y_label, ignore.case = TRUE)) { - df_plot$Value <- pmax(df_plot$Value, 0) - } - - # Compute shared bandwidth for KDE across both HTML and PDF (with smoothing adjust) - valid_vals <- df_plot$Value[is.finite(df_plot$Value)] - bw_shared <- if (length(valid_vals) >= 2) stats::bw.nrd0(valid_vals) * adjust else NULL - - # Store the clamped data for PDF generation as well (keeps parity) - assign(plot_data_key, list( - df_long = df_plot, - title = title, - x_labels = x_labels, - fill_map = fill_map, - color_map = color_map, - x_title = x_title, - y_label = y_label, - legend = legend, - ylim = ylim, - override_outline_vars = override_outline_vars, - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = box_width, - x_tickangle = x_tickangle, - violin_outline_fill = violin_outline_fill, - box_outline_default = box_outline_default, - bandwidth = bw_shared, - adjust = adjust - ), envir = .GlobalEnv) - - # If ggplot format is requested, return it directly - if (format == "ggplot") { - return(build_violin_plot_ggplot( - df_long = df_plot, - title = title, - x_labels = x_labels, - fill_map = fill_map, - color_map = color_map, - x_title = x_title, - y_label = y_label, - legend = legend, - ylim = ylim, - override_outline_vars = override_outline_vars, - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = box_width, - x_tickangle = x_tickangle, - violin_outline_fill = violin_outline_fill, - box_outline_default = box_outline_default, - bandwidth = bw_shared - )) - } - - # Create plotly plot directly with explicit x-axis positioning - p <- plot_ly() - - # Get all unique levels and create numeric positions - all_levels <- levels(df_plot$Variable) - if (is.null(all_levels)) { - all_levels <- unique(as.character(df_plot$Variable)) - } - - # Create numeric x positions for each level - x_positions <- seq_along(all_levels) - names(x_positions) <- all_levels - - # Determine Plotly tick angle so labels finish at the tick mark (matching PDF hjust = 1) - tick_angle_plotly <- if (!is.null(x_tickangle) && is.finite(x_tickangle) && x_tickangle != 0) x_tickangle else 0 - tick_label_position <- if (tick_angle_plotly == 0) "outside" else "outside right" - - # Add violin traces first (they will be in the background) - for (i in seq_along(all_levels)) { - var <- all_levels[i] - var_data <- df_plot[df_plot$Variable == var, ] - - # Skip if no data for this level - if (nrow(var_data) == 0) next - - line_col <- if (isTRUE(violin_outline_fill)) fill_map[var] else "black" - fill_rgba <- to_rgba(fill_map[var], violin_alpha) - - p <- p %>% add_trace( - x = rep(x_positions[var], nrow(var_data)), - y = var_data$Value, - type = "violin", - side = "both", - name = x_labels[i], - fillcolor = fill_rgba, - line = list(color = to_rgba(line_col, 1.0), width = 0.6), - spanmode = "hard", - bandwidth = bw_shared, - points = FALSE, - showlegend = FALSE, - box = list(visible = FALSE), - meanline = list(visible = FALSE), - scalemode = "width", - width = 0.8 - ) - } - - # Add boxplot traces second (they will be on top) - for (i in seq_along(all_levels)) { - var <- all_levels[i] - var_data <- df_plot[df_plot$Variable == var, ] - - # Determine box outline color - box_color <- if (var %in% override_outline_vars) "grey90" else box_outline_default - fill_rgba_box <- to_rgba(fill_map[var], box_alpha) - - p <- p %>% add_trace( - x = rep(x_positions[var], nrow(var_data)), - y = var_data$Value, - type = "box", - name = paste(x_labels[i], "Box"), - fillcolor = fill_rgba_box, - line = list(color = to_rgba(box_color, 1.0), width = 0.8), - showlegend = FALSE, - boxpoints = FALSE, - width = box_width - ) - } - - # Add mean points last (on top of everything) - mean_data <- df_plot %>% - group_by(Variable) %>% - summarise(mean_value = mean(Value, na.rm = TRUE), .groups = "drop") - - for (i in seq_along(all_levels)) { - var <- all_levels[i] - mean_row <- mean_data[mean_data$Variable == var, ] - - # Skip if no data for this level - if (nrow(mean_row) == 0) next - - p <- p %>% add_trace( - x = x_positions[var], - y = mean_row$mean_value, - type = "scatter", - mode = "markers", - name = paste(x_labels[i], "Mean"), - marker = list( - symbol = "x-thin-open", - size = 6, - color = "red", - line = list(width = 1.5, color = "red") - ), - showlegend = FALSE - ) - } - - # Store the data key as an attribute for PDF conversion - attr(p, "plot_data_key") <- plot_data_key - - # Configure layout with explicit tick positions and labels - html_title <- paste0("", gsub("\n", "
", title), "
") - p <- p %>% layout( - title = list(text = html_title, font = list(size = 22), x = 0.5, xanchor = "center"), - xaxis = list( - title = x_title, - tickmode = "array", - tickvals = x_positions, - ticktext = x_labels, - tickangle = tick_angle_plotly, - ticklabelposition = tick_label_position, - tickfont = list(size = 20), - showline = TRUE, - linecolor = "black", - linewidth = 1, - zeroline = FALSE, - range = c(min(x_positions) - 0.5, max(x_positions) + 0.5) - ), - yaxis = list( - title = y_label, - titlefont = list(size = 20), - tickfont = list(size = 18), - showline = TRUE, - linecolor = "black", - linewidth = 1, - zeroline = FALSE - ), - showlegend = legend, - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 18), - margin = list(t = 110, l = 80, r = 80, b = ifelse(x_tickangle == 0, 60, 90)) - ) - - # Apply y-axis limits if specified - if (!is.null(ylim)) { - p <- p %>% layout(yaxis = list(range = ylim)) - } else if (grepl("%", y_label)) { - p <- p %>% layout(yaxis = list(range = c(0, 100))) - } else if (grepl("count", y_label, ignore.case = TRUE)) { - max_y <- suppressWarnings(max(df_plot$Value, na.rm = TRUE)) - if (!is.finite(max_y)) max_y <- 1 - p <- p %>% layout(yaxis = list(range = c(0, max_y * 1.05))) - } - - return(p) -} - -# Helper: convert plotly object back to ggplot for PDF output -plotly_to_ggplot <- function(plotly_obj) { - if (inherits(plotly_obj, "ggplot") || inherits(plotly_obj, "gg")) { - return(plotly_obj) - } - if (is.null(plotly_obj) || !inherits(plotly_obj, "plotly")) { - return(ggplot() + - labs(title = "Plot not available") + - theme_minimal()) - } - - # Check if this plotly object has stored data - plot_data_key <- attr(plotly_obj, "plot_data_key") - if (!is.null(plot_data_key) && exists(plot_data_key, envir = .GlobalEnv)) { - plot_data <- get(plot_data_key, envir = .GlobalEnv) - - # Use the stored data to create a proper ggplot - return(build_violin_plot_ggplot( - df_long = plot_data$df_long, - title = plot_data$title, - x_labels = plot_data$x_labels, - fill_map = plot_data$fill_map, - color_map = plot_data$color_map, - x_title = if (!is.null(plot_data$x_title)) plot_data$x_title else "", - y_label = plot_data$y_label, - legend = plot_data$legend, - ylim = plot_data$ylim, - override_outline_vars = plot_data$override_outline_vars, - violin_alpha = if (!is.null(plot_data$violin_alpha)) plot_data$violin_alpha else 0.7, - box_alpha = if (!is.null(plot_data$box_alpha)) plot_data$box_alpha else 0.6, - box_width = if (!is.null(plot_data$box_width)) plot_data$box_width else 0.05, - x_tickangle = if (!is.null(plot_data$x_tickangle)) plot_data$x_tickangle else 45, - violin_outline_fill = isTRUE(plot_data$violin_outline_fill), - box_outline_default = if (!is.null(plot_data$box_outline_default)) plot_data$box_outline_default else "grey20", - bandwidth = plot_data$bandwidth - )) - } - - # Check if this is a grouped plot built via build_grouped_violin_plot - grouped_info <- attr(plotly_obj, "grouped_data") - if (!is.null(grouped_info)) { - df <- grouped_info$df - # Ensure factor levels - df$bin <- factor(df$bin, levels = grouped_info$bin_levels) - df$group <- factor(df$group, levels = names(grouped_info$fill_map)) - - # Compute effective bandwidth for grouped PDF - vals <- df$value - vals <- vals[is.finite(vals)] - bw_eff <- grouped_info$bandwidth - if (is.null(bw_eff) || !is.numeric(bw_eff) || is.na(bw_eff) || bw_eff <= 0) { - if (length(vals) >= 2) { - bw_eff <- stats::bw.nrd0(vals) - } else { - bw_eff <- NA_real_ - } - } - if (is.na(bw_eff) || bw_eff <= 0) bw_eff <- 0.1 - - p <- ggplot(df, aes(x = bin, y = value, fill = group)) + - # Violin outlines should match fill color - geom_violin(aes(color = group), - alpha = if (!is.null(grouped_info$violin_alpha)) grouped_info$violin_alpha else 0.7, - position = position_dodge(width = if (!is.null(grouped_info$dodge_width)) grouped_info$dodge_width else 0.8), scale = "width", show.legend = TRUE, bw = bw_eff, trim = TRUE - ) + - scale_color_manual(values = grouped_info$fill_map, guide = "none") + - geom_boxplot( - width = if (!is.null(grouped_info$box_width)) grouped_info$box_width else 0.05, - outlier.shape = NA, - alpha = if (!is.null(grouped_info$box_alpha)) grouped_info$box_alpha else 0.6, - position = position_dodge(width = if (!is.null(grouped_info$dodge_width)) grouped_info$dodge_width else 0.8), - color = "grey20", show.legend = FALSE, lwd = 0.3 - ) + - stat_summary( - fun = mean, geom = "point", shape = 4, size = 1, color = "red", stroke = 1, - position = position_dodge(width = if (!is.null(grouped_info$dodge_width)) grouped_info$dodge_width else 0.8), show.legend = FALSE - ) + - scale_fill_manual(values = grouped_info$fill_map, labels = grouped_info$legend_labels) + - labs(title = grouped_info$title, x = "", y = grouped_info$y_label) + - labs(title = grouped_info$title, x = "", y = grouped_info$y_label) + - theme_classic(base_size = 11) + - theme( - plot.title = element_text(size = 12, face = "bold", hjust = 0.5), - axis.title = element_text(size = 12), - axis.text.y = element_text(size = 11), - axis.text.x = element_text( - size = 11, angle = if (!is.null(grouped_info$x_tickangle)) grouped_info$x_tickangle else 0, - hjust = ifelse(!is.null(grouped_info$x_tickangle) && grouped_info$x_tickangle == 0, 0.5, 1) - ), - legend.position = "bottom", - legend.title = element_blank() - ) - - if (!is.null(grouped_info$ylim)) { - p <- p + coord_cartesian(ylim = grouped_info$ylim) - } - return(p) - } - - # Check if this is a faceted plot built via build_violin_plot_facets - facet_info <- attr(plotly_obj, "facet_data") - if (!is.null(facet_info)) { - df <- facet_info$df - fill_map <- facet_info$fill_map - x_labels <- facet_info$x_labels - # Ensure factors and orders - df$Variable <- factor(df$Variable, levels = names(fill_map)) - df$facet <- factor(df$facet, levels = facet_info$facet_levels) - - p <- ggplot(df, aes(x = Variable, y = Value, fill = Variable)) + - geom_violin(alpha = if (!is.null(facet_info$violin_alpha)) facet_info$violin_alpha else 0.7, scale = "width", show.legend = FALSE) + - geom_boxplot(width = if (!is.null(facet_info$box_width)) facet_info$box_width else 0.05, outlier.shape = NA, alpha = if (!is.null(facet_info$box_alpha)) facet_info$box_alpha else 0.6, show.legend = FALSE, lwd = 0.3) + - stat_summary(fun = mean, geom = "point", shape = 4, size = 1, color = "red", stroke = 1, show.legend = FALSE) + - scale_fill_manual(values = fill_map, labels = x_labels, guide = if (isTRUE(facet_info$show_legend)) guide_legend(override.aes = list(shape = NA)) else "none") + - scale_x_discrete(labels = x_labels) + - labs(title = facet_info$title, x = "", y = facet_info$y_label) + - theme_classic(base_size = 14) + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14), - axis.text.x = element_text( - size = 12, angle = if (!is.null(facet_info$x_tickangle)) facet_info$x_tickangle else 45, - hjust = ifelse(!is.null(facet_info$x_tickangle) && facet_info$x_tickangle == 0, 0.5, 1) - ), - legend.position = if (isTRUE(facet_info$show_legend)) "bottom" else "none", - strip.placement = "outside", - strip.text.x = element_text(size = 16), - strip.background = element_blank() - ) + - facet_grid(. ~ facet, scales = "free_x", space = "free", switch = "x") - - if (!is.null(facet_info$ylim)) { - p <- p + coord_cartesian(ylim = facet_info$ylim) - } - return(p) - } - - # Profile fallback: build ggplot from stored profile_data - prof_info <- attr(plotly_obj, "profile_data") - if (!is.null(prof_info)) { - df <- prof_info$df - # Canonical Fusion color override - FUSION_COLOR <- "#F1C40F" - detect_fusion <- function(df) { - tryCatch( - { - (("category" %in% names(df)) && any(grepl("fusion", df$category, ignore.case = TRUE))) || - (("label" %in% names(df)) && any(grepl("fusion", df$label, ignore.case = TRUE))) - }, - error = function(e) FALSE - ) - } - lc <- if (detect_fusion(df)) FUSION_COLOR else prof_info$line_color - - # Compute x-axis break count (1..K-1, ≥K) - k_max <- if (!is.null(prof_info$k_max) && is.finite(prof_info$k_max)) prof_info$k_max else suppressWarnings(max(df$k[is.finite(df$k)], na.rm = TRUE)) - if (!is.finite(k_max) || is.na(k_max) || k_max < 2) k_max <- 20 - - # Helper to lighten HEX colors - lighten_hex <- function(hex, amount = 0.4) { - rgb <- grDevices::col2rgb(hex) - r <- as.integer(round(rgb[1] + (255 - rgb[1]) * amount)) - g <- as.integer(round(rgb[2] + (255 - rgb[2]) * amount)) - b <- as.integer(round(rgb[3] + (255 - rgb[3]) * amount)) - grDevices::rgb(r, g, b, maxColorValue = 255) - } - - # Prepare summary-line data and aesthetics so PDF mirrors HTML styling - stat_cols <- intersect(colnames(df), c("mean", "median")) - line_stats <- if (length(stat_cols)) { - df %>% - dplyr::select(k, dplyr::all_of(stat_cols)) %>% - tidyr::pivot_longer(cols = dplyr::all_of(stat_cols), names_to = "stat", values_to = "value") %>% - dplyr::filter(!is.na(value)) %>% - dplyr::mutate(stat = dplyr::recode(stat, mean = "Mean", median = "Median")) - } else { - data.frame(k = numeric(0), stat = character(0), value = numeric(0)) - } - - line_levels <- unique(line_stats$stat) - line_palette <- if (length(line_levels)) setNames(rep(lc, length(line_levels)), line_levels) else character(0) - linetype_values <- if (length(line_levels)) setNames(rep("solid", length(line_levels)), line_levels) else character(0) - if ("Median" %in% names(linetype_values)) linetype_values["Median"] <- "dotdash" - central_stat <- if ("Mean" %in% line_levels) "Mean" else if ("Median" %in% line_levels) "Median" else NULL - legend_linewidths <- if (length(line_levels)) setNames(ifelse(line_levels == "Median", 1.0, 1.2), line_levels) else numeric(0) - - tick_breaks <- seq_len(k_max) - label_last <- paste0("\u2265", k_max) - ticktexts <- c(as.character(seq_len(k_max - 1)), label_last) - - p <- ggplot(df, aes(x = k)) + - geom_ribbon(aes(ymin = q1, ymax = q3, fill = "IQR"), alpha = 0.25, show.legend = TRUE, key_glyph = "rect") + - theme(legend.position = "bottom") + - scale_y_continuous(limits = c(0, 100)) + - scale_x_continuous( - breaks = tick_breaks, - labels = ticktexts, - expand = expansion(mult = c(0.01, 0.01)) - ) + - labs(title = prof_info$title, x = "Exons", y = "% Transcripts", fill = "") + - scale_fill_manual(values = c("IQR" = lighten_hex(lc, 0.6))) + - theme_classic(base_size = 14) + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 14), - axis.text = element_text(size = 12), - legend.position = "bottom", - legend.box = "horizontal" - ) - - # Add lines - if (nrow(line_stats) > 0) { - p <- p + geom_line(data = line_stats, aes(y = value, color = stat, linetype = stat, linewidth = stat)) + - scale_color_manual(values = line_palette, name = "") + - scale_linetype_manual(values = linetype_values, name = "") + - scale_linewidth_manual(values = legend_linewidths, name = "") - } - - return(p) - } - - return(ggplot() + - labs(title = "Plot not convertible") + - theme_minimal()) -} - -generate_sqantisc_plots <- function(SQANTI_cell_summary, Classification_file, Junctions, report_output, generate_pdf = TRUE) { - # Helper function to mix colors - mix_color <- function(col, target, amount) { - c_rgb <- col2rgb(col) - t_rgb <- col2rgb(target) - mix <- c_rgb * (1 - amount) + t_rgb * amount - rgb(mix[1], mix[2], mix[3], maxColorValue = 255) - } - - # Generate UMAP plots by structural category if UMAP exists - if (exists("gg_umap") && !is.null(gg_umap)) { - tryCatch( - { - umap_data <- gg_umap$data - - # Merge with SQANTI_cell_summary - # umap_data has 'Barcode', SQANTI_cell_summary has 'CB' - merged_umap <- inner_join(umap_data, SQANTI_cell_summary, by = c("Barcode" = "CB")) - - if (nrow(merged_umap) > 0) { - gg_umap_by_category <<- list() - - # Define categories and their colors - cat_colors <- c( - "FSM_prop" = "#6BAED6", - "ISM_prop" = "#FC8D59", - "NIC_prop" = "#78C679", - "NNC_prop" = "#EE6A50", - "Genic_Genomic_prop" = "#969696", - "Antisense_prop" = "#66C2A4", - "Fusion_prop" = "goldenrod1", - "Intergenic_prop" = "darksalmon", - "Genic_intron_prop" = "#41B6C4" - ) - - cat_labels <- c( - "FSM_prop" = "FSM", - "ISM_prop" = "ISM", - "NIC_prop" = "NIC", - "NNC_prop" = "NNC", - "Genic_Genomic_prop" = "Genic Genomic", - "Antisense_prop" = "Antisense", - "Fusion_prop" = "Fusion", - "Intergenic_prop" = "Intergenic", - "Genic_intron_prop" = "Genic Intron" - ) - - for (cat_col in names(cat_colors)) { - if (cat_col %in% colnames(merged_umap)) { - cat_color <- cat_colors[[cat_col]] - cat_label <- cat_labels[[cat_col]] - - # Calculate gradient end colors - # High percentages: darker hue of the category color (not pure black) - dark_color <- mix_color(cat_color, "black", 0.6) - # Low percentages: whiter hue but not pure white - light_color <- mix_color(cat_color, "white", 0.8) - - p <- ggplot(merged_umap, aes(x = UMAP_1, y = UMAP_2, color = .data[[cat_col]])) + - geom_point(alpha = 0.6, size = 0.5) + - theme_classic() + - labs(title = paste("UMAP - %", cat_label), x = "UMAP 1", y = "UMAP 2", color = paste0(entity_label_plural, ", %")) + - theme( - plot.title = element_text(size = 18, face = "bold", hjust = 0.5), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.title = element_text(face = "bold"), - legend.position = "right", - legend.key.height = unit(3, "cm"), - legend.key.width = unit(1, "cm") # Thicker legend bar - ) + - scale_color_gradientn(colors = c(light_color, cat_color, dark_color)) + # Custom gradient - guides(color = guide_colorbar(barwidth = 2.5, barheight = 15)) # Make legend bar thicker and taller - - gg_umap_by_category[[cat_label]] <<- p - } - } - - # ---------------------------------------------------------------- - # Short Read Support by Cluster (Violin Plots) - # ---------------------------------------------------------------- - # Use the new column name: srjunctions_support_prop - if ("srjunctions_support_prop" %in% colnames(merged_umap) && sum(merged_umap$srjunctions_support_prop, na.rm = TRUE) > 0) { - gg_sr_cluster_plots <<- list() - - # Helper: Prepare data for build_violin_plot - # We need a long DF with columns: Variable (Cluster), Value (Prop) for the function - prepare_violin_data <- function(data, y_col) { - df <- data[!is.na(data[[y_col]]), c("Cluster", y_col)] - colnames(df) <- c("Variable", "Value") - df$Variable <- as.factor(df$Variable) - return(df) - } - - - # ---------------------------------------------------------------- - # Define Cluster Colors (Shared for both TSS and Short Read Coverage) - # ---------------------------------------------------------------- - unique_clusters <- levels(merged_umap$Cluster) - cluster_colors <- scales::hue_pal()(length(unique_clusters)) - names(cluster_colors) <- unique_clusters - - # ---------------------------------------------------------------- - # TSS Ratio Validated Support by Cluster (Violin Plots) - # ---------------------------------------------------------------- - if ("TSS_ratio_validated_prop" %in% colnames(merged_umap) && sum(merged_umap$TSS_ratio_validated_prop, na.rm = TRUE) > 0) { - gg_tss_cluster_plots <<- list() - - # Reuse helper if available, or redefine locally - prepare_violin_data <- function(data, y_col) { - df <- data[!is.na(data[[y_col]]), c("Cluster", y_col)] - colnames(df) <- c("Variable", "Value") - df$Variable <- as.factor(df$Variable) - return(df) - } - - # 1. All Transcripts Plot - TSS - # Use Cluster Colors (same as Junctions Coverage) - p_all_tss <- build_violin_plot( - df_long = prepare_violin_data(merged_umap, "TSS_ratio_validated_prop"), - title = "All Transcripts TSS Validation by Short Reads", - x_labels = levels(merged_umap$Cluster), - fill_map = cluster_colors, - x_title = "Cluster", - y_label = "TSS Ratio Validated, %", - x_tickangle = 0, - violin_outline_fill = TRUE, - violin_alpha = 0.7, - box_alpha = 0.3 - ) - gg_tss_cluster_plots[["All Transcripts"]] <<- p_all_tss - - # 2. Per-Category Plots - TSS - # Use Category Color for ALL clusters - for (cat_col in names(cat_colors)) { - tag <- cat_labels[[cat_col]] - prop_col <- paste0(tag, "_TSS_ratio_validated_prop") - if (tag == "Genic Genomic") prop_col <- "Genic_TSS_ratio_validated_prop" # Handle Genic weirdness if needed - if (tag == "Genic Intron") prop_col <- "Genic_intron_TSS_ratio_validated_prop" - - # Clean up tag to match column naming convention if straightforward - simple_tag <- names(cat_labels)[which(cat_labels == tag)] - simple_tag <- gsub("_prop", "", simple_tag) - prop_col <- paste0(simple_tag, "_TSS_ratio_validated_prop") - - if (prop_col %in% colnames(merged_umap)) { - # Define single color map - current_cat_color <- cat_colors[[cat_col]] - fixed_color_map <- rep(current_cat_color, length(unique_clusters)) - names(fixed_color_map) <- unique_clusters - - p_cat_tss <- build_violin_plot( - df_long = prepare_violin_data(merged_umap, prop_col), - title = paste(tag, "TSS Validation by Short Reads"), - x_labels = levels(merged_umap$Cluster), - fill_map = fixed_color_map, - x_title = "Cluster", - y_label = "TSS Ratio Validated, %", - x_tickangle = 0, - violin_outline_fill = TRUE, - violin_alpha = 0.7, - box_alpha = 0.3 - ) - gg_tss_cluster_plots[[tag]] <<- p_cat_tss - } - } - } - - # ---------------------------------------------------------------- - # Short Read Support by Cluster (Violin Plots) - # ---------------------------------------------------------------- - # 1. Global Plot - global_data <- prepare_violin_data(merged_umap, "srjunctions_support_prop") - - # Use build_violin_plot (which returns a Plotly object) - gg_sr_cluster_plots[["All Transcripts"]] <<- build_violin_plot( - df_long = global_data, - title = "All Transcripts Junction Coverage by Short Reads", - x_labels = levels(global_data$Variable), - fill_map = cluster_colors, - y_label = "Transcripts Supported, %", - legend = FALSE, - x_title = "Cluster", - x_tickangle = 0, - ylim = c(0, 100), - violin_outline_fill = TRUE, - violin_alpha = 0.7, - box_alpha = 0.3 - ) - - # 2. Per-Category Plots - # For these, we want to maintain the specific Structural Category color Scheme? - # The user said: "the colors used for the violins and boxes should be the same in each structural category and the color should be the corresponding to the structural category." - # This implies that for the FSM plot, ALL clusters should be colored with the FSM color. - - for (cat_col in names(cat_labels)) { - # cat_labels[[cat_col]] is e.g. "FSM", "Genic Genomic" - tag <- cat_labels[[cat_col]] - tag_clean <- gsub(" ", "_", tag) - # New column name format: {TAG}_srjunctions_support_prop - sr_col <- paste0(tag_clean, "_srjunctions_support_prop") - - if (sr_col %in% colnames(merged_umap)) { - cat_data <- prepare_violin_data(merged_umap, sr_col) - - # Define a single color map for all clusters based on the category color - # cat_colors[[cat_col]] gives the hex code for that category - current_cat_color <- cat_colors[[cat_col]] - fixed_color_map <- rep(current_cat_color, length(unique_clusters)) - names(fixed_color_map) <- unique_clusters - - title <- paste(tag, "Junction Coverage by Short Reads") - - gg_sr_cluster_plots[[tag]] <<- build_violin_plot( - df_long = cat_data, - title = title, - x_labels = levels(cat_data$Variable), - fill_map = fixed_color_map, # Per-category uses the category color for all clusters - y_label = "Transcripts Supported, %", - legend = FALSE, - x_title = "Cluster", - x_tickangle = 0, - ylim = c(0, 100), - violin_outline_fill = TRUE, - violin_alpha = 0.7, - box_alpha = 0.3 - ) - } - } - } - } - }, - error = function(e) { - print(paste("Error generating UMAP by category plots:", e$message)) - } - ) - } - - # ---------------------------------------------------------------- - # Helper: Build Continuous UMAP (for coloring by %) - # ---------------------------------------------------------------- - build_continuous_umap <- function(data, color_col, title, color_base = "blue") { - # Helper to mix colors (local to function to avoid dependency issues) - mix_color <- function(col, target, amount) { - c_rgb <- col2rgb(col) - t_rgb <- col2rgb(target) - mix <- c_rgb * (1 - amount) + t_rgb * amount - rgb(mix[1], mix[2], mix[3], maxColorValue = 255) - } - - # Ensure data has UMAP coords - if (!all(c("UMAP_1", "UMAP_2") %in% colnames(data))) return(NULL) - - # Filter NA - plot_data <- data[!is.na(data[[color_col]]), ] - if (nrow(plot_data) == 0) return(NULL) - - # Define gradient colors - dark_color <- mix_color(color_base, "black", 0.6) - light_color <- mix_color(color_base, "white", 0.8) - - p <- ggplot(plot_data, aes(x = UMAP_1, y = UMAP_2, color = .data[[color_col]])) + - geom_point(alpha = 0.6, size = 0.5) + - scale_color_gradientn( - colors = c(light_color, color_base, dark_color), - limits = c(0, max(plot_data[[color_col]], na.rm = TRUE)) - ) + - guides(color = guide_colorbar(barwidth = 2.5, barheight = 15)) + - labs(title = title, x = "UMAP 1", y = "UMAP 2", color = paste0(entity_label_plural, ", %")) + - theme_classic() + - theme( - plot.title = element_text(hjust = 0.5, face = "bold", size = 14), - axis.title = element_text(size = 16), - axis.text.x = element_text(size = 14), - axis.text.y = element_text(size = 14), - legend.position = "right", - legend.title = element_text(face = "bold"), - legend.key.height = unit(3, "cm"), - legend.key.width = unit(1, "cm") - ) - return(p) - } - - # ---------------------------------------------------------------- - # Short Read (SJ) Validation UMAPs - # ---------------------------------------------------------------- - gg_sr_umap_plots <<- list() - if (exists("merged_umap") && "srjunctions_support_prop" %in% colnames(merged_umap)) { - # Global - gg_sr_umap_plots[["All Transcripts"]] <<- build_continuous_umap( - merged_umap, - "srjunctions_support_prop", - "All Transcripts Junction Coverage by Short Reads", - color_base = "#cd4f39" - ) - - # Per-Category - for (cat_col in names(cat_labels)) { - tag <- cat_labels[[cat_col]] - tag_clean <- gsub(" ", "_", tag) - sr_col <- paste0(tag_clean, "_srjunctions_support_prop") - - if (sr_col %in% colnames(merged_umap)) { - gg_sr_umap_plots[[tag]] <<- build_continuous_umap( - merged_umap, - sr_col, - paste(tag, "Junction Coverage by Short Reads"), - color_base = cat_colors[[cat_col]] - ) - } - } - } - - # ---------------------------------------------------------------- - # TSS Validation UMAPs - # ---------------------------------------------------------------- - gg_tss_umap_plots <<- list() - if (exists("merged_umap") && "TSS_ratio_validated_prop" %in% colnames(merged_umap)) { - # Global - gg_tss_umap_plots[["All Transcripts"]] <<- build_continuous_umap( - merged_umap, - "TSS_ratio_validated_prop", - "All Transcripts TSS Validation by Short Reads", - color_base = "#ffc125" - ) - - # Per-Category - for (cat_col in names(cat_labels)) { - tag <- cat_labels[[cat_col]] - tag_clean <- gsub(" ", "_", tag) - # Handle special cases if any (e.g. Genic Genomic) - simple_tag <- names(cat_labels)[which(cat_labels == tag)] - simple_tag <- gsub("_prop", "", simple_tag) - tss_col <- paste0(simple_tag, "_TSS_ratio_validated_prop") - - if (tss_col %in% colnames(merged_umap)) { - gg_tss_umap_plots[[tag]] <<- build_continuous_umap( - merged_umap, - tss_col, - paste(tag, "TSS Validation by Short Reads"), - color_base = cat_colors[[cat_col]] - ) - } - } - } - - - - - - - - - - # Helper: grouped violins by bin with legend (Annotated/Novel) using plotly, rebuildable for PDF - # df must contain columns: bin, group, value - build_grouped_violin_plot <- function(df, - bin_levels, - group_levels, - title, - fill_map, - legend_labels, - y_label = "Genes, %", - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_width = 0.45, - dodge_width = 0.8, - violangap = 0.05, - violingroupgap = 0.15, - legend_title = NULL) { - # Ensure factors - df$bin <- factor(df$bin, levels = bin_levels) - df$group <- factor(df$group, levels = group_levels) - - # Store metadata for PDF reconstruction - grouped_info <- list( - df = data.frame(bin = df$bin, group = df$group, value = df$value), - bin_levels = bin_levels, - title = title, - fill_map = fill_map, - legend_labels = legend_labels, - y_label = y_label, - ylim = ylim, - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = box_width, - x_tickangle = x_tickangle, - violin_width = violin_width, - dodge_width = dodge_width, - violangap = violangap, - violingroupgap = violingroupgap - ) - - # Build plotly grouped violins - # Clamp to the provided ylim to avoid tails outside bounds - df_clamped <- df - df_clamped$value <- pmin(pmax(df_clamped$value, ylim[1]), ylim[2]) - - # Shared bandwidth across groups for consistent KDE - valid_vals <- df_clamped$value[is.finite(df_clamped$value)] - bw_shared <- if (length(valid_vals) >= 2) stats::bw.nrd0(valid_vals) else NULL - - p <- plot_ly() - - # Create combined x-axis labels for proper spacing - # For each bin, we'll have two positions (one for each group) - x_positions <- numeric() - x_tick_positions <- numeric() - x_tick_labels <- character() - y_vals <- numeric() - group_vals <- character() - - base_gap <- 0.3 # gap between groups within a bin - bin_gap <- 1.0 # gap between bins - - current_x <- 0 - - for (i in seq_along(bin_levels)) { - bin_label <- bin_levels[i] - - # Center position for this bin's tick - bin_center <- current_x + base_gap * (length(group_levels) - 1) / 2 - x_tick_positions <- c(x_tick_positions, bin_center) - x_tick_labels <- c(x_tick_labels, bin_label) - - for (j in seq_along(group_levels)) { - grp <- group_levels[j] - grp_df <- df_clamped[df_clamped$bin == bin_label & df_clamped$group == grp, , drop = FALSE] - - if (nrow(grp_df) > 0) { - x_pos <- current_x + (j - 1) * base_gap - x_positions <- c(x_positions, rep(x_pos, nrow(grp_df))) - y_vals <- c(y_vals, grp_df$value) - group_vals <- c(group_vals, rep(grp, nrow(grp_df))) - } - - current_x_for_trace <- current_x + (j - 1) * base_gap - fill_rgba <- to_rgba(fill_map[grp], violin_alpha) - - # Only add trace if we have data - if (nrow(grp_df) > 0) { - p <- p %>% add_trace( - x = rep(current_x_for_trace, nrow(grp_df)), - y = grp_df$value, - type = "violin", - name = legend_labels[grp], - legendgroup = grp, - scalegroup = grp, - fillcolor = fill_rgba, - line = list(color = to_rgba(fill_map[grp], 1.0), width = 0.8), - spanmode = "hard", - bandwidth = bw_shared, - side = "both", - width = violin_width, - points = FALSE, - showlegend = (i == 1), # Only show legend for first bin - box = list(visible = FALSE), - meanline = list(visible = FALSE) - ) - } - } - - current_x <- current_x + length(group_levels) * base_gap + bin_gap - } - - # Reset current_x for boxplots and mean markers - current_x <- 0 - for (i in seq_along(bin_levels)) { - bin_label <- bin_levels[i] - - for (j in seq_along(group_levels)) { - grp <- group_levels[j] - grp_df <- df_clamped[df_clamped$bin == bin_label & df_clamped$group == grp, , drop = FALSE] - - current_x_for_trace <- current_x + (j - 1) * base_gap - fill_rgba_box <- to_rgba(fill_map[grp], box_alpha) - - # Boxplots - if (nrow(grp_df) > 0) { - p <- p %>% add_trace( - x = rep(current_x_for_trace, nrow(grp_df)), - y = grp_df$value, - type = "box", - name = paste0(legend_labels[grp], " Box"), - legendgroup = grp, - fillcolor = fill_rgba_box, - line = list(color = to_rgba("grey20", 1.0), width = 1), - showlegend = FALSE, - boxpoints = FALSE, - width = box_width - ) - - # Mean markers - mean_val <- mean(grp_df$value, na.rm = TRUE) - p <- p %>% add_trace( - x = current_x_for_trace, - y = mean_val, - type = "scatter", - mode = "markers", - name = paste0(legend_labels[grp], " Mean"), - legendgroup = grp, - marker = list(symbol = "x-thin-open", size = 6, color = "red", line = list(width = 1.5, color = "red")), - showlegend = FALSE - ) - } - } - - current_x <- current_x + length(group_levels) * base_gap + bin_gap - } - - # Attach metadata for PDF - attr(p, "grouped_data") <- grouped_info <- c(grouped_info, list(bandwidth = bw_shared)) - - tick_angle_plotly <- if (!is.null(x_tickangle) && is.finite(x_tickangle) && x_tickangle != 0) x_tickangle else 0 - tick_label_position <- if (tick_angle_plotly == 0) "outside" else "outside right" - - html_title <- paste0("", gsub("\n", "
", title), "
") - # Build legend object with optional title - legend_obj <- list(orientation = "h", x = 0.5, xanchor = "center", y = -0.15, yanchor = "top") - if (!is.null(legend_title)) legend_obj$title <- list(text = legend_title) - - p <- p %>% layout( - title = list(text = html_title, font = list(size = 22), x = 0.5, xanchor = "center"), - xaxis = list( - title = "", - tickmode = "array", - tickvals = x_tick_positions, - ticktext = x_tick_labels, - tickangle = tick_angle_plotly, - ticklabelposition = tick_label_position, - tickfont = list(size = 20), - showline = TRUE, - linecolor = "black", - linewidth = 1, - zeroline = FALSE, - range = c(-0.5, max(x_tick_positions) + 1) - ), - yaxis = list( - title = y_label, titlefont = list(size = 20), tickfont = list(size = 18), range = ylim, - showline = TRUE, linecolor = "black", linewidth = 1, zeroline = FALSE - ), - violinmode = "overlay", - legend = legend_obj, - showlegend = TRUE, - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 18), - margin = list(t = 80, l = 80, r = 80, b = 120) - ) - return(p) - } - - `%||%` <- function(x, y) if (is.null(x)) y else x - assign_plot <- function(name, plot) assign(name, plot, envir = .GlobalEnv) - build_violin_from_long <- function(df_long, args) { - do.call(build_violin_plot, c(list(df_long = df_long), args)) - } - single_violin <- function(df, cfg) { - var <- cfg$column - df_long <- data.frame(Variable = factor(var, levels = var), Value = df[[var]]) - fill_map <- setNames(cfg$fill, var) - base_args <- list( - title = cfg$title, - x_labels = cfg$x_labels %||% cfg$x_label, - fill_map = fill_map, - legend = cfg$legend %||% FALSE, - format = cfg$format %||% "plotly" - ) - if (!is.null(cfg$y_label)) base_args$y_label <- cfg$y_label - plot_args <- c(base_args, cfg$plot_args %||% list()) - assign_plot(cfg$name, build_violin_from_long(df_long, plot_args)) - } - pivot_violin <- function(df, cfg) { - df_long <- pivot_long(df, cfg$columns) - fill_map <- cfg$fill_map %||% setNames(rep(cfg$fill, length(cfg$columns)), cfg$columns) - base_args <- list( - title = cfg$title, - x_labels = cfg$x_labels, - fill_map = fill_map, - legend = cfg$legend %||% FALSE, - format = cfg$format %||% "plotly" - ) - if (!is.null(cfg$y_label)) base_args$y_label <- cfg$y_label - plot_args <- c(base_args, cfg$plot_args %||% list()) - assign_plot(cfg$name, build_violin_from_long(df_long, plot_args)) - } - render_pdf_plot <- function(name, converter = plotly_to_ggplot) { - if (exists(name)) { - obj <- get(name) - print(if (is.null(converter)) obj else converter(obj)) - } - } - - # Center a ggplot on a page with reduced width - render_pdf_plot_centered <- function(name, width_frac = 0.45, converter = plotly_to_ggplot) { - if (!exists(name)) { - return(invisible(NULL)) - } - obj <- get(name) - p <- if (is.null(converter)) obj else converter(obj) - g <- if (inherits(p, "grob")) p else ggplotGrob(p) - left_right <- (1 - width_frac) / 2 - grid.arrange(nullGrob(), g, nullGrob(), widths = c(left_right, width_frac, left_right), newpage = TRUE) - } - - # Helper: build length-distribution violins for given column prefix using native plotly - # If mono=TRUE, uses *_length_mono_prop columns; otherwise *_length_prop - build_len_violin_for_prefix <- function(df, prefix, title, fill_color, box_fill = NULL, mono = FALSE, box_outline_color = "grey20", violin_alpha = 0.5, box_alpha = 0.3, violin_outline_fill = FALSE, format = "plotly") { - if (is.null(box_fill)) box_fill <- fill_color - suffix <- if (mono) "_length_mono_prop" else "_length_prop" - cols <- c( - paste0(prefix, "_250b", suffix), - paste0(prefix, "_500b", suffix), - paste0(prefix, "_short", suffix), - paste0(prefix, "_mid", suffix), - paste0(prefix, "_long", suffix) - ) - df_long <- pivot_longer(df, cols = all_of(cols), names_to = "Variable", values_to = "Value") %>% select(Variable, Value) - df_long$Variable <- factor(df_long$Variable, levels = cols) - # Clamp to [0,100] because these are proportions - df_long$Value <- pmin(pmax(df_long$Value, 0), 100) - - # Store data globally for PDF generation - plot_data_key <- paste0("plot_data_", gsub("[^A-Za-z0-9]", "_", title)) - - # Create fill and color maps for this plot - fill_map <- setNames(rep(fill_color, length(cols)), cols) - color_map <- setNames(rep(fill_color, length(cols)), cols) - x_labels <- c("0-250bp", "250-500bp", "500-1000bp", "1000-2000bp", ">2000bp") - names(x_labels) <- cols - - assign(plot_data_key, list( - df_long = df_long, - title = title, - x_labels = x_labels, - fill_map = fill_map, - color_map = color_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = NULL, - override_outline_vars = character(0), - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = violin_outline_fill, - box_outline_default = box_outline_color - ), envir = .GlobalEnv) - - if (format == "ggplot") { - return(build_violin_plot_ggplot( - df_long = df_long, - title = title, - x_labels = x_labels, - fill_map = fill_map, - color_map = color_map, - x_title = "", - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = NULL, - override_outline_vars = character(0), - violin_alpha = violin_alpha, - box_alpha = box_alpha, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = violin_outline_fill, - box_outline_default = box_outline_color - )) - } - - # Create plotly plot directly - p <- plot_ly() - - # Add violin traces first (background) - for (i in seq_along(levels(df_long$Variable))) { - var <- levels(df_long$Variable)[i] - var_data <- df_long[df_long$Variable == var, ] - - line_col <- if (isTRUE(violin_outline_fill)) fill_color else "black" - fill_rgba <- to_rgba(fill_color, violin_alpha) - p <- p %>% add_trace( - data = var_data, - x = ~Variable, - y = ~Value, - type = "violin", - side = "both", - name = c("0-250bp", "250-500bp", "500-1000bp", "1000-2000bp", ">2000bp")[i], - fillcolor = fill_rgba, - line = list(color = to_rgba(line_col, 1.0), width = 0.6), - spanmode = "hard", - points = FALSE, - showlegend = FALSE, - box = list(visible = FALSE), - meanline = list(visible = FALSE) - ) - } - - # Add boxplot traces second (on top) - for (i in seq_along(levels(df_long$Variable))) { - var <- levels(df_long$Variable)[i] - var_data <- df_long[df_long$Variable == var, ] - - # Use the box_outline_color parameter directly for the line color - fill_rgba_box <- to_rgba(if (is.null(box_fill)) fill_color else box_fill, box_alpha) - - p <- p %>% add_trace( - data = var_data, - x = ~Variable, - y = ~Value, - type = "box", - name = paste(c("0-250bp", "250-500bp", "500-1000bp", "1000-2000bp", ">2000bp")[i], "Box"), - fillcolor = fill_rgba_box, - line = list(color = to_rgba(box_outline_color, 1.0), width = 0.8), - showlegend = FALSE, - boxpoints = FALSE, - width = 0.05 - ) - } - - # Add mean points last (on top of everything) - mean_data <- df_long %>% - group_by(Variable) %>% - summarise(mean_value = mean(Value, na.rm = TRUE), .groups = "drop") - - p <- p %>% add_trace( - data = mean_data, - x = ~Variable, - y = ~mean_value, - type = "scatter", - mode = "markers", - name = "Mean", - marker = list( - symbol = "x-thin-open", - size = 6, - color = "red", - line = list(width = 1.5, color = "red") - ), - showlegend = FALSE - ) - - # Store the data key as an attribute for PDF conversion - attr(p, "plot_data_key") <- plot_data_key - - # Configure layout - html_title <- paste0("", gsub("\n", "
", title), "
") - tick_angle_plotly <- 45 - tick_label_position <- "outside right" - p <- p %>% layout( - title = list(text = html_title, font = list(size = 22), x = 0.5, xanchor = "center"), - xaxis = list( - title = "", - categoryorder = "array", - categoryarray = cols, - tickmode = "array", - tickvals = cols, - ticktext = x_labels, - tickangle = tick_angle_plotly, - ticklabelposition = tick_label_position, - tickfont = list(size = 20), - showline = TRUE, - linecolor = "black", - linewidth = 1, - zeroline = FALSE - ), - yaxis = list( - title = paste(entity_label_plural, ", %", sep = ""), - titlefont = list(size = 20), - tickfont = list(size = 18), - range = c(0, 100), - showline = TRUE, - linecolor = "black", - linewidth = 1, - zeroline = FALSE - ), - showlegend = FALSE, - paper_bgcolor = "rgba(0,0,0,0)", - plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 18), - margin = list(t = 110, l = 80, r = 80, b = 100) - ) - - return(p) - } - - # Helper: build a per-category exon count profile (median + IQR across cells) - build_exon_profile_plot <- function(df_prof, title, line_color, k_max = 20, y_label = paste(entity_label_plural, ", %", sep = ""), n_cells = NULL) { - # Sanitize title and forbid subtitles - title <- tryCatch( - { - t <- gsub("(?i)cells included:[^|<>]*", "", title, perl = TRUE) - t <- gsub("(?i)]*>.*", "", t, perl = TRUE) # drop any HTML subtitle lines - trimws(gsub("\n.*", "", t)) - }, - error = function(e) title - ) - - # Canonical Fusion color - FUSION_COLOR <- "#F1C40F" - # Helper: detect Fusion profiles using title or data, case-insensitive - is_fusion_profile <- function(df, ttl) { - ttl_has <- tryCatch(!is.null(ttl) && grepl("fusion", ttl, ignore.case = TRUE), error = function(e) FALSE) - in_df <- tryCatch( - { - any(grepl("fusion", names(df), ignore.case = TRUE)) || - (("category" %in% names(df)) && any(grepl("fusion", df$category, ignore.case = TRUE))) || - (("label" %in% names(df)) && any(grepl("fusion", df$label, ignore.case = TRUE))) - }, - error = function(e) FALSE - ) - ttl_has || in_df - } - # Helpers: color utilities - hex_to_rgba <- function(hex, alpha = 0.25) { - rgb <- grDevices::col2rgb(hex) - sprintf("rgba(%d,%d,%d,%.2f)", rgb[1], rgb[2], rgb[3], alpha) - } - lighten_hex <- function(hex, amount = 0.35) { - # amount in [0,1] toward white - rgb <- grDevices::col2rgb(hex) - r <- as.integer(round(rgb[1] + (255 - rgb[1]) * amount)) - g <- as.integer(round(rgb[2] + (255 - rgb[2]) * amount)) - b <- as.integer(round(rgb[3] + (255 - rgb[3]) * amount)) - grDevices::rgb(r, g, b, maxColorValue = 255) - } - - # Override Fusion color if needed - if (is_fusion_profile(df_prof, title)) { - line_color <- FUSION_COLOR - } - - # df_prof columns: k, median, q1, q3, (optional) mean - if (is.null(df_prof) || nrow(df_prof) == 0 || all(!is.finite(df_prof$median))) { - p_empty <- plot_ly() %>% layout( - title = list(text = paste0("", title, ""), font = list(size = 18), x = 0.5, xanchor = "center"), - yaxis = list(title = y_label, range = c(0, 100)), - annotations = list( - list( - text = "No data available for this category", - showarrow = FALSE, - x = 0.5, - y = 0.5, - xref = "paper", - yref = "paper", - font = list(size = 14, color = "gray") - ) - ) - ) - return(p_empty) - } - label_last <- paste0("\u2265", k_max) # ≥K - ticktexts <- c(as.character(seq_len(k_max - 1)), label_last) - - # Choose center line: use mean if present, else median - y_center <- if (!is.null(df_prof$mean)) df_prof$mean else df_prof$median - - p <- plot_ly() - # Lower bound (q1) - p <- p %>% add_trace( - x = df_prof$k, y = df_prof$q1, - type = "scatter", mode = "lines", - line = list(color = line_color, width = 0.0001), - name = "Q1", showlegend = FALSE - ) - # Upper bound (q3) with fill to previous trace - if (!is.null(df_prof$q1) && !is.null(df_prof$q3)) { - iqr_fill <- hex_to_rgba(lighten_hex(line_color, 0.4), 0.25) - p <- p %>% plotly::add_ribbons( - x = df_prof$k, ymin = df_prof$q1, ymax = df_prof$q3, - fillcolor = iqr_fill, - line = list(color = "rgba(0,0,0,0)"), - name = "IQR", showlegend = TRUE - ) - } - # Central (mean or median) straight line + markers - p <- p %>% add_trace( - x = df_prof$k, y = y_center, - type = "scatter", mode = "lines+markers", - line = list(color = line_color, width = 2.5), - marker = list(color = line_color, size = 6), - name = if (!is.null(df_prof$mean)) "Mean" else "Median", showlegend = TRUE - ) - # If mean exists, overlay dashed median - if (!is.null(df_prof$mean) && !is.null(df_prof$median)) { - p <- p %>% add_trace( - x = df_prof$k, y = df_prof$median, - type = "scatter", mode = "lines", - line = list(color = line_color, width = 1.5, dash = "dash"), - name = "Median", showlegend = TRUE - ) - } - # Legend at bottom, horizontal (no extra annotations like 'Cells included') - p <- p %>% layout( - legend = list(orientation = "h", y = -0.2, x = 0.5, xanchor = "center") - ) - p <- p %>% layout( - title = list(text = title, font = list(size = 22), x = 0.5, xanchor = "center"), - xaxis = list( - title = paste("Exons per", entity_label), - titlefont = list(size = 20), - tickfont = list(size = 18), - tickmode = "array", tickvals = seq_len(k_max), ticktext = ticktexts, - showline = TRUE, linecolor = "black", linewidth = 1, zeroline = FALSE - ), - yaxis = list( - title = y_label, range = c(0, 100), - titlefont = list(size = 20), - tickfont = list(size = 18), - showline = TRUE, linecolor = "black", linewidth = 1, zeroline = FALSE - ), - paper_bgcolor = "rgba(0,0,0,0)", plot_bgcolor = "rgba(0,0,0,0)", - font = list(family = "Arial", size = 18), - margin = list(t = 90, l = 80, r = 60, b = 80) - ) - - # Attach metadata for PDF fallback - attr(p, "profile_data") <- list( - df = df_prof, title = title, line_color = line_color, k_max = k_max, y_label = y_label - ) - return(p) - } - - ### Basic cell informtion ### - ############################# - - single_defaults <- list( - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = FALSE, - box_outline_default = "black" - ) - common_plot_args <- list( - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - violin_outline_fill = FALSE, - box_outline_default = "black", - adjust = 1.5 - ) - - # 1. Number of Reads Across Cells - cfg_reads <- list( - column = count_col, - name = "gg_reads_in_cells", - title = paste("Number of", entity_label_plural, "Across Cells"), - fill = "#CC6633", - y_label = paste(entity_label_plural, ", count", sep = ""), - x_label = "Cells", - plot_args = common_plot_args, - format = "ggplot" - ) - single_violin(SQANTI_cell_summary, cfg_reads) - - # 2. Number of UMIs Across Cells (only if not isoforms mode) - if (mode != "isoforms") { - cfg_umis <- list( - column = "UMIs_in_cell", - name = "gg_umis_in_cells", - title = "Number of UMIs Across Cells", - fill = "#CC6633", - y_label = "UMIs, count", - x_label = "Cells", - plot_args = common_plot_args, - format = "ggplot" - ) - single_violin(SQANTI_cell_summary, cfg_umis) - } - - # 3. Number of Genes Across Cells - cfg_genes <- list( - column = "Genes_in_cell", - name = "gg_genes_in_cells", - title = "Number of Genes Across Cells", - fill = "#CC6633", - y_label = "Genes, count", - x_label = "Cells", - plot_args = common_plot_args, - format = "ggplot" - ) - single_violin(SQANTI_cell_summary, cfg_genes) - - # 4. Number of Unique Junction Chains Across Cells - if (mode != "isoforms" && "UJCs_in_cell" %in% names(SQANTI_cell_summary) && !all(is.na(SQANTI_cell_summary$UJCs_in_cell)) && max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) > 0) { - cfg_ujcs <- list( - column = "UJCs_in_cell", - name = "gg_JCs_in_cell", - title = "Number of Unique Junction\nChains Across Cells", - fill = "#CC6633", - y_label = "UJCs, count", - x_label = "Cells", - plot_args = common_plot_args, - format = "ggplot" - ) - single_violin(SQANTI_cell_summary, cfg_ujcs) - } - - pivot_defaults <- list( - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - violin_outline_fill = FALSE, - box_outline_default = "black" - ) - pivot_violin(SQANTI_cell_summary, list( - name = "gg_annotation_of_genes_in_cell", - columns = c("Annotated_genes", "Novel_genes"), - title = "Number of Known/Novel Genes Across Cells", - x_labels = c("Annotated Genes", "Novel Genes"), - y_label = paste(entity_label_plural, ", counts", sep = ""), - fill_map = c("Annotated_genes" = fill_color_orange, "Novel_genes" = fill_color_orange), - plot_args = pivot_defaults - )) - - if ("Genes_in_cell" %in% colnames(SQANTI_cell_summary)) { - SQANTI_cell_summary$Annotated_genes_perc <- ifelse( - SQANTI_cell_summary$Genes_in_cell > 0, - 100 * SQANTI_cell_summary$Annotated_genes / SQANTI_cell_summary$Genes_in_cell, - 0 - ) - SQANTI_cell_summary$Novel_genes_perc <- ifelse( - SQANTI_cell_summary$Genes_in_cell > 0, - 100 * SQANTI_cell_summary$Novel_genes / SQANTI_cell_summary$Genes_in_cell, - 0 - ) - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_annotation_of_genes_percent_in_cell", - columns = c("Annotated_genes_perc", "Novel_genes_perc"), - title = "Percentage of Known/Novel Genes Across Cells", - x_labels = c("Annotated Genes", "Novel Genes"), - y_label = "Genes, %", - fill_map = c("Annotated_genes_perc" = fill_color_orange, "Novel_genes_perc" = fill_color_orange), - plot_args = pivot_defaults - )) - } - - # 5. Percentage of Reads/Transcripts from Known/Novel Genes Across Cells - # (Enabled for both reads and isoforms modes) - { - classification_valid <- Classification_file[Classification_file$CB != "unassigned" & !is.na(Classification_file$CB), ] - - if (nrow(classification_valid) > 0) { - # Function to expand FL and CB columns into a long format for correct counting per cell - expand_isoform_counts <- function(df, mode) { - if (mode == "reads") { - return(df %>% group_by(CB) %>% summarise(count = n(), .groups = "drop")) - } else { - # Isoforms mode: Each row has comma-separated FL (counts) and CB (barcodes) - # We need to split them and sum counts per barcode - - # Initialize lists to store expanded data - all_cbs <- character() - all_counts <- numeric() - - # Iterate through rows (this might be slow for huge files, but safe) - # A vectorised approach would be better if possible, but strsplit returns list - fl_list <- strsplit(as.character(df$FL), ",") - cb_list <- strsplit(as.character(df$CB), ",") - - # Check if lengths match (they should) - if (length(fl_list) != length(cb_list)) { - stop("Mismatch in row counts between FL and CB columns") - } - - # Use mapply to create a data frame of all counts - # This creates a list of data frames, one per isoform - expanded_list <- mapply(function(fl, cb) { - if (length(fl) != length(cb)) { - # Warning or skip? For now, we assume they match as per SQANTI specs - return(NULL) - } - data.frame(CB = cb, count = as.numeric(fl), stringsAsFactors = FALSE) - }, fl_list, cb_list, SIMPLIFY = FALSE) - - # Bind all tiny data frames - long_df <- do.call(rbind, expanded_list) - - # Now group by CB and sum - return(long_df %>% group_by(CB) %>% summarise(count = sum(count, na.rm = TRUE), .groups = "drop")) - } - } - - annotated_reads_per_cell <- classification_valid %>% - filter(!grepl("^novel", associated_gene)) - - annotated_reads_per_cell <- expand_isoform_counts(annotated_reads_per_cell, mode) %>% - rename(Annotated_genes_reads = count) - - novel_reads_per_cell <- classification_valid %>% - filter(grepl("^novel", associated_gene)) - - novel_reads_per_cell <- expand_isoform_counts(novel_reads_per_cell, mode) %>% - rename(Novel_genes_reads = count) - - SQANTI_cell_summary <- SQANTI_cell_summary %>% - left_join(annotated_reads_per_cell, by = "CB") %>% - left_join(novel_reads_per_cell, by = "CB") - - SQANTI_cell_summary$Annotated_genes_reads[is.na(SQANTI_cell_summary$Annotated_genes_reads)] <- 0 - SQANTI_cell_summary$Novel_genes_reads[is.na(SQANTI_cell_summary$Novel_genes_reads)] <- 0 - - # Revert to original denominator (Total Transcripts in Cell) now that numerators are correct - SQANTI_cell_summary$Annotated_reads_perc <- 100 * SQANTI_cell_summary$Annotated_genes_reads / SQANTI_cell_summary[[count_col]] - SQANTI_cell_summary$Novel_reads_perc <- 100 * SQANTI_cell_summary$Novel_genes_reads / SQANTI_cell_summary[[count_col]] - - SQANTI_cell_summary$Annotated_reads_perc <- ifelse(is.na(SQANTI_cell_summary$Annotated_reads_perc) | is.infinite(SQANTI_cell_summary$Annotated_reads_perc), 0, SQANTI_cell_summary$Annotated_reads_perc) - SQANTI_cell_summary$Novel_reads_perc <- ifelse(is.na(SQANTI_cell_summary$Novel_reads_perc) | is.infinite(SQANTI_cell_summary$Novel_reads_perc), 0, SQANTI_cell_summary$Novel_reads_perc) - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_annotation_of_reads_in_cell", - columns = c("Annotated_reads_perc", "Novel_reads_perc"), - title = paste("Percentage of", entity_label_plural, "from Known/Novel Genes Across Cells"), - x_labels = c("Annotated Genes", "Novel Genes"), - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = c("Annotated_reads_perc" = fill_color_orange, "Novel_reads_perc" = fill_color_orange), - plot_args = pivot_defaults - )) - } else { - message("Warning: No valid classification data found. Skipping read expression by gene annotation plot.") - gg_annotation_of_reads_in_cell <<- plot_ly() %>% - layout( - title = paste("Percentage of", entity_label_plural, "from Known/Novel Genes Across Cells"), - annotations = list( - text = paste(entity_label, "expression by gene annotation\nnot available"), - showarrow = FALSE, - font = list(size = 16, color = "gray") - ) - ) - } - } - - single_violin(SQANTI_cell_summary, list( - name = "gg_MT_perc", - column = "MT_perc", - title = paste("Mitochondrial", entity_label_plural, "Across Cells"), - x_labels = c("Cell"), - y_label = paste(entity_label_plural, ", %", sep = ""), - fill = "#CC6633", - plot_args = list( - violin_alpha = 0.5, - box_alpha = 3, - box_width = 0.05, - x_tickangle = 45 - ) - )) - - - ### Gene Distribution by Read Count Bins (configurable gene bins) ### - #################################################################### - - # Define gene read-count bins and labels - gene_bin_label <- function(n) { - if (is.na(n)) { - return(NA_character_) - } - if (n == 1) { - return("1") - } - if (n >= 2 && n <= 5) { - return("2-5") - } - if (n >= 6 && n <= 9) { - return("6-9") - } - return(">=10") - } - gene_bin_levels <- c("1", "2-5", "6-9", ">=10") - - # Build per-cell per-gene read counts from classification - genes_by_cb <- Classification_file %>% - filter(!is.na(CB), CB != "unassigned", !is.na(associated_gene)) %>% - group_by(CB, associated_gene) %>% - summarise(reads_per_gene = n(), .groups = "drop") %>% - mutate( - gene_type = ifelse(grepl("^novel", associated_gene), "Novel", "Annotated"), - bin = vapply(reads_per_gene, gene_bin_label, character(1)) - ) %>% - filter(!is.na(bin)) - - # Percent of genes per bin within each CB and gene type - read_bins_data <- genes_by_cb %>% - group_by(CB, gene_type, bin) %>% - summarise(num_genes = n(), .groups = "drop") %>% - group_by(CB, gene_type) %>% - mutate(percentage = 100 * num_genes / sum(num_genes)) %>% - ungroup() %>% - tidyr::complete(CB, gene_type, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) - - read_bins_data$bin <- factor(read_bins_data$bin, levels = gene_bin_levels) - read_bins_data$gene_type <- factor(read_bins_data$gene_type, levels = c("Annotated", "Novel")) - - if (mode == "isoforms") { - gg_read_bins <<- build_grouped_violin_plot( - df = read_bins_data %>% transmute(bin = as.character(bin), group = as.character(gene_type), value = percentage), - bin_levels = gene_bin_levels, - group_levels = c("Annotated", "Novel"), - title = paste("Distribution of Known/Novel Genes by", entity_label, "Count Bins Across Cells"), - fill_map = c("Annotated" = "#e37744", "Novel" = "#78C679"), - legend_labels = c("Annotated" = "Annotated", "Novel" = "Novel"), - y_label = "Genes, %", - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_width = 0.28, - dodge_width = 1.0 - ) - } - - # Combined (all genes together): one violin per bin - if (mode == "reads") { - # Filter for Annotated genes only - read_bins_all <- genes_by_cb %>% - filter(gene_type == "Annotated") %>% - group_by(CB, bin) %>% - summarise(num_genes = n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(percentage = 100 * num_genes / sum(num_genes)) %>% - ungroup() %>% - tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) - - plot_title_all <- paste("Distribution of Annotated Genes by", entity_label, "Count Bins Across Cells") - } else { - # All genes (Annotated + Novel) - read_bins_all <- genes_by_cb %>% - group_by(CB, bin) %>% - summarise(num_genes = n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(percentage = 100 * num_genes / sum(num_genes)) %>% - ungroup() %>% - tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) - - plot_title_all <- paste("Distribution of Genes by", entity_label, "Count Bins Across Cells") - } - - read_bins_all$bin <- factor(read_bins_all$bin, levels = gene_bin_levels) - - { - df_long <- data.frame( - Variable = factor(read_bins_all$bin, levels = gene_bin_levels), - Value = read_bins_all$percentage - ) - fill_map <- setNames(rep("#CC6633", length(gene_bin_levels)), gene_bin_levels) - gg_read_bins_all <<- build_violin_plot( - df_long, - title = plot_title_all, - x_labels = as.character(gene_bin_levels), - fill_map = fill_map, - y_label = "Genes, %", - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - box_outline_default = "black", - violin_outline_fill = FALSE - ) - } - - # New plot: Distribution of Known Genes by Unique Isoform Count Bins Across Cells (Isoforms mode) - if (mode == "isoforms") { - iso_bins_annot <- genes_by_cb %>% - filter(gene_type == "Annotated") %>% - group_by(CB, bin) %>% - summarise(num_genes = n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(percentage = 100 * num_genes / sum(num_genes)) %>% - ungroup() %>% - tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) - - iso_bins_annot$bin <- factor(iso_bins_annot$bin, levels = gene_bin_levels) - - df_long_iso <- data.frame( - Variable = factor(iso_bins_annot$bin, levels = gene_bin_levels), - Value = iso_bins_annot$percentage - ) - # Use Annotated color #e37744 - fill_map_iso <- setNames(rep("#e37744", length(gene_bin_levels)), gene_bin_levels) - - gg_isoform_bins <<- build_violin_plot( - df_long_iso, - title = "Distribution of Known Genes by Unique Isoform Count Bins Across Cells", - x_labels = as.character(gene_bin_levels), - fill_map = fill_map_iso, - y_label = "Genes, %", - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - box_outline_default = "black", - violin_outline_fill = FALSE - ) - } - - # UJC bins (combined) using jxn strings per gene per CB - if (mode != "isoforms") { - ujc_bin_label <- function(n) { - if (is.na(n)) { - return(NA_character_) - } - if (n == 1) { - return("1") - } - if (n >= 2 && n <= 3) { - return("2-3") - } - if (n >= 4 && n <= 5) { - return("4-5") - } - return(">=6") - } - ujc_bin_levels <- c("1", "2-3", "4-5", ">=6") - - # Check if jxn_string exists (it won't if --skip_hash was used) - if ("jxn_string" %in% colnames(Classification_file)) { - ujc_by_cb <- Classification_file %>% - filter(!is.na(CB), CB != "unassigned", !is.na(associated_gene), exons > 1) %>% - group_by(CB, associated_gene) %>% - summarise(ujc_per_gene = dplyr::n_distinct(jxn_string), .groups = "drop") %>% - mutate(bin = vapply(ujc_per_gene, ujc_bin_label, character(1))) %>% - filter(!is.na(bin)) - } else { - ujc_by_cb <- data.frame() - } - - if (nrow(ujc_by_cb) > 0) { - # For reads mode, filter for Annotated genes only - if (mode == "reads") { - ujc_by_cb <- ujc_by_cb %>% - mutate(gene_type = ifelse(grepl("^novel", associated_gene), "Novel", "Annotated")) %>% - filter(gene_type == "Annotated") - - plot_title_ujc_all <- "Distribution of Annotated Genes by UJC Count Bins Across Cells" - } else { - plot_title_ujc_all <- "Distribution of Genes by UJC Count Bins Across Cells" - } - - ujc_bins_all <- ujc_by_cb %>% - group_by(CB, bin) %>% - summarise(num_genes = n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(percentage = 100 * num_genes / sum(num_genes)) %>% - ungroup() %>% - tidyr::complete(CB, bin = ujc_bin_levels, fill = list(num_genes = 0, percentage = 0)) - - ujc_bins_all$bin <- factor(ujc_bins_all$bin, levels = ujc_bin_levels) - - { - df_long <- data.frame( - Variable = factor(ujc_bins_all$bin, levels = ujc_bin_levels), - Value = ujc_bins_all$percentage - ) - } - fill_map <- setNames(rep("#CC6633", length(ujc_bin_levels)), ujc_bin_levels) - gg_ujc_bins_all <<- build_violin_plot( - df_long, - title = plot_title_ujc_all, - x_labels = as.character(ujc_bin_levels), - fill_map = fill_map, - y_label = "Genes, %", - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - box_outline_default = "black", - violin_outline_fill = FALSE - ) - } - - # Create UJC bins data - ujc_bins_data <- data.frame( - CB = rep(SQANTI_cell_summary$CB, 8), - bin = rep(c("1", "2-3", "4-5", ">=6", "1", "2-3", "4-5", ">=6"), each = nrow(SQANTI_cell_summary)), - gene_type = rep(c("Annotated", "Annotated", "Annotated", "Annotated", "Novel", "Novel", "Novel", "Novel"), each = nrow(SQANTI_cell_summary)), - percentage = c( - SQANTI_cell_summary$anno_ujc_bin1_perc, - SQANTI_cell_summary$anno_ujc_bin2_3_perc, - SQANTI_cell_summary$anno_ujc_bin4_5_perc, - SQANTI_cell_summary$anno_ujc_bin6plus_perc, - SQANTI_cell_summary$novel_ujc_bin1_perc, - SQANTI_cell_summary$novel_ujc_bin2_3_perc, - SQANTI_cell_summary$novel_ujc_bin4_5_perc, - SQANTI_cell_summary$novel_ujc_bin6plus_perc - ) - ) - - # Handle NA and invalid values - ujc_bins_data <- ujc_bins_data %>% - mutate(percentage = ifelse(is.na(percentage) | is.infinite(percentage) | percentage < 0, 0, percentage)) - - ujc_bins_data$bin <- factor(ujc_bins_data$bin, levels = c("1", "2-3", "4-5", ">=6")) - ujc_bins_data$gene_type <- factor(ujc_bins_data$gene_type, levels = c("Annotated", "Novel")) - - # Only generate split plot if NOT in reads mode - if (mode != "reads") { - gg_ujc_bins <<- build_grouped_violin_plot( - df = ujc_bins_data %>% transmute(bin = as.character(bin), group = as.character(gene_type), value = percentage), - bin_levels = ujc_bin_levels, - group_levels = c("Annotated", "Novel"), - title = "Distribution of Known/Novel Genes by UJC Count Bins Across Cells", - fill_map = c("Annotated" = "#e37744", "Novel" = "#78C679"), - legend_labels = c("Annotated" = "Annotated", "Novel" = "Novel"), - y_label = "Genes, %", - ylim = c(0, 100), - violin_alpha = 0.5, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_width = 0.28, - dodge_width = 1.0 - ) - } - } - - # Mitochondrial percentage in cell - { - df_long <- data.frame(Variable = "MT_perc", Value = SQANTI_cell_summary$MT_perc) - df_long$Variable <- factor(df_long$Variable, levels = "MT_perc") - fill_map = c("MT_perc" = fill_color_orange) - x_labels <- c("Cell") - gg_MT_perc <<- build_violin_plot( - df_long, - title = paste("Mitochondrial", entity_label_plural, "\nAcross Cells"), - x_labels = x_labels, - fill_map = fill_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - violin_alpha = 0.5, - box_alpha = 3, - box_width = 0.05, - x_tickangle = 45 - ) - } - - # Mono/multi-exon prop novel vs annotated genes - - ### Length distribution ### - ########################### - - # Compact helpers for repeated per-category and length plots - cat_tags <- c("FSM", "ISM", "NIC", "NNC", "Genic", "Antisense", "Fusion", "Intergenic", "Genic_intron") - cat_labels_pretty <- c("FSM", "ISM", "NIC", "NNC", "Genic\nGenomic", "Antisense", "Fusion", "Intergenic", "Genic\nIntron") - cat_fill_map <- c(FSM = "#6BAED6", ISM = "#FC8D59", NIC = "#78C679", NNC = "#EE6A50", Genic = "#969696", Antisense = "#66C2A4", Fusion = "goldenrod1", Intergenic = "darksalmon", Genic_intron = "#41B6C4") - structural_category_map <- c( - "full-splice_match" = "FSM", - "incomplete-splice_match" = "ISM", - "novel_in_catalog" = "NIC", - "novel_not_in_catalog" = "NNC", - "genic" = "Genic", - "antisense" = "Antisense", - "fusion" = "Fusion", - "intergenic" = "Intergenic", - "genic_intron" = "Genic_intron" - ) - structural_category_levels <- unname(structural_category_map) - # Build violin across categories and assign to a global name - # Helper: build 9 tag column names from suffix (e.g. "_intrapriming_prop") - cat_cols <- function(suffix) paste0(cat_tags, suffix) - # Length plot generator and variable name mapping - cat_var_base <- c(FSM = "FSM", ISM = "ISM", NIC = "NIC", NNC = "NNC", Genic = "genic", Antisense = "antisense", Fusion = "fusion", Intergenic = "intergenic", Genic_intron = "genic_intron") - make_len_plot <- function(prefix, pretty, color, mono = FALSE) { - var_nm <- if (mono) paste0("gg_", cat_var_base[[prefix]], "_mono_read_distr") else paste0("gg_", cat_var_base[[prefix]], "_read_distr") - title_txt <- if (mono) paste0(pretty, " Mono-exonic Read Lengths Distribution Across Cells") else paste0(pretty, " Reads Length Distribution Across Cells") - assign(var_nm, build_len_violin_for_prefix( - SQANTI_cell_summary, - prefix = prefix, - title = title_txt, - fill_color = color, - box_fill = color, - mono = mono, - violin_alpha = 0.7, - box_alpha = 0.3, - box_outline_color = if (prefix %in% c("Genic")) "grey90" else "grey20", - violin_outline_fill = TRUE - ), envir = .GlobalEnv) - } - - # Bulk distributions - gg_bulk_all_reads <<- ggplot(Classification_file, aes(x = length)) + - geom_histogram(binwidth = 50, fill = "#CC6633", color = "black", alpha = 0.5) + - labs( - title = paste("All", entity_label, "Lengths Distribution"), - x = paste(entity_label, "length"), - y = paste(entity_label_plural, ", counts", sep = "") - ) + - theme_classic() + - theme( - legend.position = "none", - plot.title = element_text(size = 14, face = "bold", hjust = 0.5), - plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14), - axis.text.x = element_text(size = 16) - ) - - # Bulk read length distribution by structural category - Classification_file$structural_category <- factor( - Classification_file$structural_category, - levels = c( - "full-splice_match", - "incomplete-splice_match", - "novel_in_catalog", - "novel_not_in_catalog", - "genic", - "antisense", - "fusion", - "intergenic", - "genic_intron" - ) - ) - - structural_category_labels <- c( - "full-splice_match" = "FSM", - "incomplete-splice_match" = "ISM", - "novel_in_catalog" = "NIC", - "novel_not_in_catalog" = "NNC", - "genic" = "Genic Genomic", - "antisense" = "Antisense", - "fusion" = "Fusion", - "intergenic" = "Intergenic", - "genic_intron" = "Genic Intron" - ) - structural_category_palette <- c( - "FSM" = "#6BAED6", - "ISM" = "#FC8D59", - "NIC" = "#78C679", - "NNC" = "#EE6A50", - "Genic Genomic" = "#969696", - "Antisense" = "#66C2A4", - "Fusion" = "goldenrod1", - "Intergenic" = "darksalmon", - "Genic Intron" = "#41B6C4" - ) - Classification_file$structural_category_pretty <- structural_category_labels[as.character(Classification_file$structural_category)] - Classification_file$structural_category_pretty <- factor( - Classification_file$structural_category_pretty, - levels = names(structural_category_palette) - ) - - gg_bulk_length_by_category <<- ggplot(Classification_file, aes(x = length, color = structural_category_pretty)) + - geom_freqpoly(binwidth = 100, linewidth = 1.2, na.rm = TRUE) + - labs( - title = paste("All", entity_label, "Lengths Distribution by Structural Category"), - x = paste(entity_label, "length"), - y = paste(entity_label_plural, ", counts", sep = ""), - color = NULL - ) + - theme_classic(base_size = 16) + - scale_color_manual(values = structural_category_palette, drop = FALSE) + - scale_x_continuous( - breaks = scales::pretty_breaks(n = 8), - labels = scales::comma - ) + - theme( - legend.position = "bottom", - legend.title = element_blank(), - legend.key.size = unit(0.8, "cm"), - legend.text = element_text(size = 12), - plot.title = element_text(size = 14, face = "bold", hjust = 0.5), - plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14), - axis.text.x = element_text(size = 16) - ) + - guides(color = guide_legend(nrow = 2)) - - # Mono vs multi-exon classification for length - Classification_file$exons <- as.numeric(Classification_file$exons) - - gg_bulk_length_by_exon_type <<- ggplot( - Classification_file, - aes(x = length, color = ifelse(exons == 1, "Mono-Exon", "Multi-Exon")) - ) + - geom_freqpoly(binwidth = 100, linewidth = 1.2, na.rm = TRUE) + - labs( - title = paste("Mono- vs Multi- Exon", entity_label, "Lengths Distribution"), - x = paste(entity_label, "length"), - y = paste(entity_label_plural, ", counts", sep = ""), - color = NULL - ) + - theme_classic(base_size = 16) + - scale_color_manual( - values = c("Multi-Exon" = "#3B0057", "Mono-Exon" = "#FFE44C") - ) + - scale_x_continuous( - breaks = scales::pretty_breaks(n = 8), - labels = scales::comma - ) + - theme( - legend.position = "bottom", - legend.title = element_blank(), - legend.key.size = unit(1, "cm"), - legend.text = element_text(size = 14), - plot.title = element_text(size = 14, face = "bold", hjust = 0.5), - plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), - axis.title = element_text(size = 16), - axis.text.y = element_text(size = 14), - axis.text.x = element_text(size = 16) - ) - - # Cell-level length distributions (all + mono) - gg_read_distr <<- build_len_violin_for_prefix( - SQANTI_cell_summary, - prefix = "Total", - title = paste(entity_label_plural, "Length Distribution Across Cells"), - fill_color = "#CC6633", - box_fill = "#CC6633", - mono = FALSE, - violin_alpha = 0.7, - box_alpha = 0.6, - box_outline_color = "grey20", - violin_outline_fill = FALSE - ) - - # Mono-exon length distribution per break - gg_read_distr_mono <<- build_len_violin_for_prefix( - SQANTI_cell_summary, - prefix = "Total", - title = paste("Mono-exonic", entity_label_plural, "Length Distribution Across Cells"), - fill_color = "#CC6633", - box_fill = "#CC6633", - mono = TRUE, - violin_alpha = 0.7, - box_alpha = 0.6, - box_outline_color = "grey20", - violin_outline_fill = FALSE - ) - - # Per-category length distributions via loop - len_specs <- list( - list(tag = "FSM", pretty = "FSM", color = "#6BAED6"), - list(tag = "ISM", pretty = "ISM", color = "#FC8D59"), - list(tag = "NIC", pretty = "NIC", color = "#78C679"), - list(tag = "NNC", pretty = "NNC", color = "#EE6A50"), - list(tag = "Genic", pretty = "Genic", color = "#969696"), - list(tag = "Antisense", pretty = "Antisense", color = "#66C2A4"), - list(tag = "Fusion", pretty = "Fusion", color = "goldenrod1"), - list(tag = "Intergenic", pretty = "Intergenic", color = "darksalmon"), - list(tag = "Genic_intron", pretty = "Genic Intron", color = "#41B6C4") - ) - for (sp in len_specs) { - make_len_plot(sp$tag, sp$pretty, sp$color, mono = FALSE) - } - for (sp in len_specs) { - # Mono versions where meaningful (skip NNC and Fusion for PDF) - if (sp$tag %in% c("NNC", "Fusion")) next - make_len_plot(sp$tag, sp$pretty, sp$color, mono = TRUE) - } - - ### Reference coverage across categories ### - ############################################ - - { - cols <- c( - "FSM_ref_coverage_prop", "ISM_ref_coverage_prop", "NIC_ref_coverage_prop", "NNC_ref_coverage_prop", - "Genic_ref_coverage_prop", "Antisense_ref_coverage_prop", "Fusion_ref_coverage_prop", "Intergenic_ref_coverage_prop", - "Genic_intron_ref_coverage_prop" - ) - gg_SQANTI_pivot <- pivot_long(SQANTI_cell_summary, cols) - fill_map <- setNames(unname(cat_fill_map), cols) - x_labels <- cat_labels_pretty - # Build dynamic title using cutoff from cell summary - ref_cov_min_pct <- if ("ref_cov_min_pct" %in% colnames(SQANTI_cell_summary)) { - vals <- unique(stats::na.omit(SQANTI_cell_summary$ref_cov_min_pct)) - if (length(vals) > 0) as.numeric(vals[1]) else NA_real_ - } else { - NA_real_ - } - pct_lbl <- if (is.finite(ref_cov_min_pct)) { - if (abs(ref_cov_min_pct - round(ref_cov_min_pct)) < 1e-6) sprintf("%.0f", ref_cov_min_pct) else sprintf("%.1f", ref_cov_min_pct) - } else { - NULL - } - title_txt <- if (!is.null(pct_lbl)) { - paste0(entity_label_plural, " with Coverage >=", pct_lbl, "% of the Reference Transcript Length\nby Structural Category Across Cells") - } else { - "Reference Transcript Length Coverage\nby Structural Category Across Cells" - } - gg_ref_coverage_across_category <<- build_violin_plot( - gg_SQANTI_pivot, - title = title_txt, - x_labels = x_labels, - fill_map = fill_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - override_outline_vars = c("Genic_ref_coverage_prop", "Genic_intron_ref_coverage_prop"), - violin_outline_fill = TRUE - ) - } - - - ### Structural categories ### - - - category_fill_map <- c( - "FSM_prop" = "#6BAED6", "ISM_prop" = "#FC8D59", "NIC_prop" = "#78C679", "NNC_prop" = "#EE6A50", - "Genic_Genomic_prop" = "#969696", "Antisense_prop" = "#66C2A4", "Fusion_prop" = "goldenrod1", - "Intergenic_prop" = "darksalmon", "Genic_intron_prop" = "#41B6C4" - ) - pivot_violin(SQANTI_cell_summary, list( - name = "gg_SQANTI_across_category", - columns = names(category_fill_map), - title = "Structural Categories Distribution Across Cells", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = category_fill_map, - plot_args = list(override_outline_vars = c("Genic_Genomic_prop"), violin_outline_fill = TRUE) - )) - - # Coding/non-coding across structural categories (change it in the future to a combine plot) - if (!skipORF) { - # Update to new column naming convention: {tag}_coding_prop - # Explicitly define columns to match cell_metrics.py output (lowercase for non-canonical categories) - coding_cols <- c( - "FSM_coding_prop", "ISM_coding_prop", "NIC_coding_prop", "NNC_coding_prop", - "genic_coding_prop", "antisense_coding_prop", "fusion_coding_prop", - "intergenic_coding_prop", "genic_intron_coding_prop" - ) - coding_fill_map <- c( - "FSM_coding_prop" = "#6BAED6", - "ISM_coding_prop" = "#FC8D59", - "NIC_coding_prop" = "#78C679", - "NNC_coding_prop" = "#EE6A50", - "genic_coding_prop" = "#969696", - "antisense_coding_prop" = "#66C2A4", - "fusion_coding_prop" = "goldenrod1", - "intergenic_coding_prop" = "darksalmon", - "genic_intron_coding_prop" = "#41B6C4" - ) - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_coding_across_category", - columns = names(coding_fill_map), - title = "Coding Proportion of Structural Categories Distribution Across Cells", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = coding_fill_map, - plot_args = list(override_outline_vars = c("genic_coding_prop"), violin_outline_fill = TRUE) - )) - - - - # Define colors for non-coding (same as coding but will use alpha) - noncoding_fill_map <- c( - "FSM_non_coding_prop" = "#6BAED6", - "ISM_non_coding_prop" = "#FC8D59", - "NIC_non_coding_prop" = "#78C679", - "NNC_non_coding_prop" = "#EE6A50", - "genic_non_coding_prop" = "#969696", - "antisense_non_coding_prop" = "#66C2A4", - "fusion_non_coding_prop" = "goldenrod1", - "intergenic_non_coding_prop" = "darksalmon", - "genic_intron_non_coding_prop" = "#41B6C4" - ) - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_non_coding_across_category", - columns = names(noncoding_fill_map), - title = "Non-coding Proportion of Structural Categories Distribution Across Cells", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = noncoding_fill_map, - plot_args = list( - override_outline_vars = c(), - violin_outline_fill = TRUE, - violin_alpha = 0.4, - box_alpha = 0.1 - ) - )) - } # End of if (!skipORF) - - subcategory_configs <- list( - list( - name = "gg_SQANTI_across_FSM", - columns = c( - "FSM_alternative_3end_prop", "FSM_alternative_3end5end_prop", "FSM_alternative_5end_prop", - "FSM_reference_match_prop", "FSM_mono_exon_prop" - ), - title = "FSM Structural Subcategories Distribution Across Cells", - x_labels = c("Alternative 3'end", "Alternative 3'5'end", "Alternative 5'end", "Reference match", "Mono-exon"), - fill_map = c( - "FSM_alternative_3end_prop" = "#02314d", "FSM_alternative_3end5end_prop" = "#0e5a87", - "FSM_alternative_5end_prop" = "#7ccdfc", "FSM_reference_match_prop" = "#c4e1f2", - "FSM_mono_exon_prop" = "#cec2d2" - ), - plot_args = list(override_outline_vars = c("FSM_alternative_3end_prop", "FSM_alternative_3end5end_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_ISM", - columns = c( - "ISM_3prime_fragment_prop", "ISM_internal_fragment_prop", "ISM_5prime_fragment_prop", - "ISM_intron_retention_prop", "ISM_mono_exon_prop" - ), - title = "ISM Structural Subcategories Distribution Across Cells", - x_labels = c("3' fragment", "Internal fragment", "5' fragment", "Intron retention", "Mono-exon"), - fill_map = c( - "ISM_3prime_fragment_prop" = "#c4531d", "ISM_internal_fragment_prop" = "#e37744", - "ISM_5prime_fragment_prop" = "#e0936e", "ISM_intron_retention_prop" = "#81eb82", - "ISM_mono_exon_prop" = "#cec2d2" - ), - plot_args = list(override_outline_vars = c("ISM_3prime_fragment_prop", "ISM_internal_fragment_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_NIC", - columns = c( - "NIC_combination_of_known_junctions_prop", "NIC_combination_of_known_splicesites_prop", - "NIC_intron_retention_prop", "NIC_mono_exon_by_intron_retention_prop", "NIC_mono_exon_prop" - ), - title = "NIC Structural Subcategories Distribution Across Cells", - x_labels = c("Comb. of annot. junctions", "Comb. of annot. splice sites", "Intron retention", "Mono-exon by intron ret.", "Mono-exon"), - fill_map = c( - "NIC_combination_of_known_junctions_prop" = "#014d02", "NIC_combination_of_known_splicesites_prop" = "#379637", - "NIC_intron_retention_prop" = "#81eb82", "NIC_mono_exon_by_intron_retention_prop" = "#4aaa72", - "NIC_mono_exon_prop" = "#cec2d2" - ), - plot_args = list( - override_outline_vars = c( - "NIC_combination_of_known_junctions_prop", "NIC_combination_of_known_splicesites_prop", - "NIC_mono_exon_by_intron_retention_prop", "NIC_mono_exon_prop" - ), - violin_outline_fill = TRUE - ) - ), - list( - name = "gg_SQANTI_across_NNC", - columns = c("NNC_at_least_one_novel_splicesite_prop", "NNC_intron_retention_prop"), - title = "NNC Structural Subcategories Distribution Across Cells", - x_labels = c("At least\n1 annot. don./accept.", "Intron retention"), - fill_map = c("NNC_at_least_one_novel_splicesite_prop" = "#32734d", "NNC_intron_retention_prop" = "#81eb82"), - plot_args = list(override_outline_vars = c("NNC_at_least_one_novel_splicesite_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_Fusion", - columns = c("Fusion_intron_retention_prop", "Fusion_multi_exon_prop"), - title = "Fusion Structural Subcategories Distribution Across Cells", - x_labels = c("Intron retention", "Multi-exon"), - fill_map = c("Fusion_intron_retention_prop" = "#81eb82", "Fusion_multi_exon_prop" = "#876a91"), - plot_args = list(override_outline_vars = c("Fusion_multi_exon_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_Genic", - columns = c("Genic_mono_exon_prop", "Genic_multi_exon_prop"), - title = "Genic Structural Subcategories Distribution Across Cells", - x_labels = c("Mono-exon", "Multi-exon"), - fill_map = c("Genic_mono_exon_prop" = "#81eb82", "Genic_multi_exon_prop" = "#876a91"), - plot_args = list(override_outline_vars = c("Genic_multi_exon_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_Genic_Intron", - columns = c("Genic_intron_mono_exon_prop", "Genic_intron_multi_exon_prop"), - title = "Genic Intron Structural Subcategories Distribution Across Cells", - x_labels = c("Mono-exon", "Multi-exon"), - fill_map = c("Genic_intron_mono_exon_prop" = "#81eb82", "Genic_intron_multi_exon_prop" = "#876a91"), - plot_args = list(override_outline_vars = c("Genic_intron_multi_exon_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_Antisense", - columns = c("Antisense_mono_exon_prop", "Antisense_multi_exon_prop"), - title = "Antisense Structural Subcategories Distribution Across Cells", - x_labels = c("Mono-exon", "Multi-exon"), - fill_map = c("Antisense_mono_exon_prop" = "#81eb82", "Antisense_multi_exon_prop" = "#876a91"), - plot_args = list(override_outline_vars = c("Antisense_multi_exon_prop"), violin_outline_fill = TRUE) - ), - list( - name = "gg_SQANTI_across_Intergenic", - columns = c("Intergenic_mono_exon_prop", "Intergenic_multi_exon_prop"), - title = "Intergenic Structural Subcategories Distribution Across Cells", - x_labels = c("Mono-exon", "Multi-exon"), - fill_map = c("Intergenic_mono_exon_prop" = "#81eb82", "Intergenic_multi_exon_prop" = "#876a91"), - plot_args = list(override_outline_vars = c("Intergenic_multi_exon_prop"), violin_outline_fill = TRUE) - ) - ) - invisible(lapply(subcategory_configs, function(cfg) pivot_violin(SQANTI_cell_summary, cfg))) - - ### Splice junctions characterization ### - ######################################### - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_known_novel_canon", - columns = c("Known_canonical_junctions_prop", "Known_non_canonical_junctions_prop", "Novel_canonical_junctions_prop", "Novel_non_canonical_junctions_prop"), - title = "Splice Junctions Distribution Across Cells", - x_labels = c("Known\nCanonical", "Known\nNon-canonical", "Novel\nCanonical", "Novel\nNon-canonical"), - y_label = "Junctions, %", - fill_map = c( - "Known_canonical_junctions_prop" = "#6BAED6", - "Known_non_canonical_junctions_prop" = "goldenrod1", - "Novel_canonical_junctions_prop" = "#78C679", - "Novel_non_canonical_junctions_prop" = "#FC8D59" - ), - plot_args = list(violin_outline_fill = TRUE) - )) - ### Good features plots (SR & TSS Validation) ### - ################################################# - - # 1. Combined Good Features Plot (All Transcripts) - all_good_features_map <- list( - "srjunctions_support_prop" = list(label = "SJs Validated by SRs", color = "#cd4f39"), - "TSS_ratio_validated_prop" = list(label = "TSS Validated by SRs", color = "#FFC125") - # Add other good features here if needed (e.g. polyA_motif_found_prop if available) - ) - - # Determine which good feature columns are present - good_feature_cols_present <- intersect(names(all_good_features_map), colnames(SQANTI_cell_summary)) - good_feature_cols_present <- good_feature_cols_present[sapply(good_feature_cols_present, function(col) any(!is.na(SQANTI_cell_summary[[col]])) && sum(SQANTI_cell_summary[[col]], na.rm = TRUE) > 0)] - - if (length(good_feature_cols_present) > 0) { - current_good_colors <- sapply(all_good_features_map[good_feature_cols_present], function(x) x$color) - current_good_labels <- sapply(all_good_features_map[good_feature_cols_present], function(x) x$label) - names(current_good_colors) <- good_feature_cols_present - names(current_good_labels) <- good_feature_cols_present - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_good_feature", - columns = good_feature_cols_present, - title = "Validation Features Distribution Across Cells", - x_labels = current_good_labels, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = current_good_colors, - plot_args = list(violin_outline_fill = TRUE) - )) - } - - # 2. Per-Category Plots - # Short Read (SJs) Support - sr_cat_cols <- cat_cols("_srjunctions_support_prop") - if (all(sr_cat_cols %in% colnames(SQANTI_cell_summary)) && any(colSums(SQANTI_cell_summary[, sr_cat_cols, drop = FALSE], na.rm = TRUE) > 0)) { - pivot_violin(SQANTI_cell_summary, list( - name = "gg_sr_support_by_category", - columns = sr_cat_cols, - title = "SJs Validated by Short Reads by Structural Category", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = setNames(rep("#cd4f39", length(sr_cat_cols)), sr_cat_cols), - plot_args = list(violin_outline_fill = TRUE) - )) - } - - # TSS Validation Support - tss_cat_cols <- cat_cols("_TSS_ratio_validated_prop") - if (all(tss_cat_cols %in% colnames(SQANTI_cell_summary)) && any(colSums(SQANTI_cell_summary[, tss_cat_cols, drop = FALSE], na.rm = TRUE) > 0)) { - pivot_violin(SQANTI_cell_summary, list( - name = "gg_tss_validation_by_category", - columns = tss_cat_cols, - title = "TSS Validated by Short Reads by Structural Category", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = setNames(rep("#FFC125", length(tss_cat_cols)), tss_cat_cols), - plot_args = list(violin_outline_fill = TRUE) - )) - } - ### Bad features plots ### - ########################## - - bad_specs <- list( - list(suffix = "_intrapriming_prop", title = "Intrapriming by Structural Category", color = "#78C679", name = "gg_intrapriming_by_category"), - list(suffix = "_RTS_prop", title = "RT-switching by Structural Category", color = "#FF9933", name = "gg_RTS_by_category"), - list(suffix = "_noncanon_prop", title = "Non-Canonical Junctions by Structural Category", color = "#41B6C4", name = "gg_noncanon_by_category") - ) - invisible(lapply(bad_specs, function(sp) { - cols <- cat_cols(sp$suffix) - pivot_violin(SQANTI_cell_summary, list( - name = sp$name, - columns = cols, - title = sp$title, - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = setNames(rep(sp$color, length(cols)), cols), - plot_args = list(violin_outline_fill = TRUE) - )) - })) - - # NMD (split between categories) - nmd_cols <- c("FSM_NMD_prop", "ISM_NMD_prop", "NIC_NMD_prop", "NNC_NMD_prop", "Genic_NMD_prop", "Antisense_NMD_prop", "Fusion_NMD_prop", "Intergenic_NMD_prop", "Genic_intron_NMD_prop") - if (all(nmd_cols %in% colnames(SQANTI_cell_summary))) { - pivot_violin(SQANTI_cell_summary, list( - name = "gg_NMD_by_category", - columns = nmd_cols, - title = "Nonsense-Mediated Decay by Structural Category", - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = setNames(rep("#969696", length(nmd_cols)), nmd_cols), - plot_args = list(violin_outline_fill = TRUE) - )) - } - - ## Bad quality features combined figure - # Define all possible features, their colors, and labels - all_bad_features_map <- list( - "Intrapriming_prop_in_cell" = list(label = "Intrapriming", color = "#78C679"), - "RTS_prop_in_cell" = list(label = "RT-switching", color = "#FF9933"), - "Non_canonical_prop_in_cell" = list(label = "Non-Canonical Junctions", color = "#41B6C4"), - "NMD_prop_in_cell" = list(label = "Predicted NMD", color = "#969696") - ) - - # Determine which bad feature columns are actually present in SQANTI_cell_summary - # This implicitly handles skipORF, as NMD_prop_in_cell won't be in SQANTI_cell_summary if skipORF is TRUE - bad_feature_cols_present <- intersect(names(all_bad_features_map), colnames(SQANTI_cell_summary)) - bad_feature_cols_present <- bad_feature_cols_present[sapply(bad_feature_cols_present, function(col) any(!is.na(SQANTI_cell_summary[[col]])) && sum(SQANTI_cell_summary[[col]], na.rm = TRUE) > 0)] # keep only if data exists - - # Order them as originally intended, if present - ordered_bad_feature_cols <- c("Intrapriming_prop_in_cell", "RTS_prop_in_cell", "Non_canonical_prop_in_cell", "NMD_prop_in_cell") - bad_feature_cols_present <- intersect(ordered_bad_feature_cols, bad_feature_cols_present) - - - if (length(bad_feature_cols_present) > 0) { - current_colors <- sapply(all_bad_features_map[bad_feature_cols_present], function(x) x$color) - current_labels <- sapply(all_bad_features_map[bad_feature_cols_present], function(x) x$label) - # Ensure names are correctly assigned for scales, matching the order in bad_feature_cols_present - names(current_colors) <- bad_feature_cols_present - names(current_labels) <- bad_feature_cols_present - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_bad_feature", - columns = bad_feature_cols_present, - title = "Bad Quality Control Attributes Across Cells", - x_labels = current_labels, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = current_colors, - plot_args = list(violin_outline_fill = TRUE) - )) - } else { - gg_bad_feature <<- plot_ly() %>% - layout( - title = "No bad quality features to display", - annotations = list( - text = "No bad quality features to display", - showarrow = FALSE, - font = list(size = 18, color = "gray") - ) - ) - } - - ### Good features plots ### - ########################## - - good_specs <- list( - list(cols = cat_cols("_TSSAnnotationSupport"), title = "TSS Annotation Support by Structural Category", color = "#66C2A4", name = "gg_tss_annotation_support", require_all = TRUE), - list(cols = cat_cols("_CAGE_peak_support_prop"), title = "CAGE Peak Support by Structural Category", color = "#EE6A50", name = "gg_cage_peak_support", require_all = TRUE), - list(cols = cat_cols("_PolyA_motif_support_prop"), title = "PolyA Support by Structural Category", color = "#78C679", name = "gg_polyA_motif_support", require_all = TRUE), - list(cols = cat_cols("_canon_prop"), title = "Canonical Junctions by Structural Category", color = "#CC6633", name = "gg_canon_by_category", require_all = TRUE), - list(cols = cat_cols("_srjunctions_support_prop"), title = "Splice Junctions Support by Structural Category", color = "#cd4f39", name = "gg_sr_support_by_category", require_all = TRUE), - list(cols = cat_cols("_TSS_ratio_validated_prop"), title = "TSS Support by Structural Category", color = "#ffc125", name = "gg_tss_validation_by_category", require_all = TRUE) - ) - invisible(lapply(good_specs, function(sp) { - if (!is.null(sp$require_all) && sp$require_all && !all(sp$cols %in% colnames(SQANTI_cell_summary))) { - return(NULL) - } - # Check if data is not empty (all zeros) - if (all(sp$cols %in% colnames(SQANTI_cell_summary)) && all(colSums(SQANTI_cell_summary[, sp$cols, drop = FALSE], na.rm = TRUE) == 0)) { - return(NULL) - } - pivot_violin(SQANTI_cell_summary, list( - name = sp$name, - columns = sp$cols, - title = sp$title, - x_labels = cat_labels_pretty, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = setNames(rep(sp$color, length(sp$cols)), sp$cols), - plot_args = list(violin_outline_fill = TRUE) - )) - })) - - ## Good quality features combined figure - good_feature_cols <- c("TSSAnnotationSupport_prop") - - if ("CAGE_peak_support_prop" %in% colnames(SQANTI_cell_summary)) { - good_feature_cols <- c(good_feature_cols, "CAGE_peak_support_prop") - } - if ("PolyA_motif_support_prop" %in% colnames(SQANTI_cell_summary)) { - good_feature_cols <- c(good_feature_cols, "PolyA_motif_support_prop") - } - good_feature_cols <- c(good_feature_cols, "Canonical_prop_in_cell") - - if ("srjunctions_support_prop" %in% colnames(SQANTI_cell_summary) && sum(SQANTI_cell_summary$srjunctions_support_prop, na.rm = TRUE) > 0) { - good_feature_cols <- c(good_feature_cols, "srjunctions_support_prop") - } - if ("TSS_ratio_validated_prop" %in% colnames(SQANTI_cell_summary) && sum(SQANTI_cell_summary$TSS_ratio_validated_prop, na.rm = TRUE) > 0) { - good_feature_cols <- c(good_feature_cols, "TSS_ratio_validated_prop") - } - - color_map <- c( - "TSSAnnotationSupport_prop" = "#66C2A4", - "CAGE_peak_support_prop" = "#EE6A50", - "PolyA_motif_support_prop" = "#78C679", - "Canonical_prop_in_cell" = "#CC6633", - "srjunctions_support_prop" = "#cd4f39", - "TSS_ratio_validated_prop" = "#ffc125" - ) - label_map <- c( - "TSSAnnotationSupport_prop" = "TSS Annotated", - "CAGE_peak_support_prop" = "Has Coverage CAGE", - "PolyA_motif_support_prop" = "Has PolyA Motif", - "Canonical_prop_in_cell" = "Canonical Junctions", - "srjunctions_support_prop" = "SJs Support by SRs", - "TSS_ratio_validated_prop" = "TSS Support by SRs" - ) - color_map <- color_map[good_feature_cols] - label_map <- label_map[good_feature_cols] - - pivot_violin(SQANTI_cell_summary, list( - name = "gg_good_feature", - columns = good_feature_cols, - title = "Good Quality Control Attributes Across Cells", - x_labels = label_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - fill_map = color_map, - plot_args = list(violin_outline_fill = TRUE) - )) - - ### Exon structure across cells by structural category ### - { - cat_key_map <- structural_category_map - all_cats <- structural_category_levels - x_labels_pretty <- cat_labels_pretty - - fill_map_cat <- cat_fill_map - - # 1) Median exons per read per cell and category - cls_valid <- Classification_file %>% - dplyr::filter(CB != "unassigned") %>% - mutate(cat_key = unname(cat_key_map[structural_category])) %>% - filter(!is.na(cat_key)) - exons_mean_by_cell <- cls_valid %>% - group_by(CB, cat_key) %>% - summarise(median_exons = median(as.numeric(exons), na.rm = TRUE), .groups = "drop") %>% - tidyr::complete(CB, cat_key = all_cats, fill = list(median_exons = NA_real_)) - - df_exon_mean_long <- data.frame( - Variable = factor(exons_mean_by_cell$cat_key, levels = all_cats), - Value = exons_mean_by_cell$median_exons - ) - - gg_exon_mean_by_category <<- build_violin_plot( - df_long = df_exon_mean_long, - title = paste("Median Exons per", entity_label, "by Structural Category Across Cells"), - x_labels = x_labels_pretty, - fill_map = fill_map_cat, - y_label = paste("Exons per", entity_label), - legend = FALSE, - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - override_outline_vars = c("Genic"), - adjust = 3 - ) - - # 2) Percent mono-exonic reads per cell and category - exons_bin_by_cell <- cls_valid %>% - mutate(is_mono = as.numeric(exons) == 1) %>% - group_by(CB, cat_key) %>% - summarise(total = dplyr::n(), mono = sum(is_mono, na.rm = TRUE), .groups = "drop") %>% - tidyr::complete(CB, cat_key = all_cats, fill = list(total = 0, mono = 0)) %>% - mutate(perc_mono = ifelse(total > 0, 100 * mono / total, 0)) - - df_exon_mono_long <- data.frame( - Variable = factor(exons_bin_by_cell$cat_key, levels = all_cats), - Value = exons_bin_by_cell$perc_mono - ) - - gg_exon_mono_by_category <<- build_violin_plot( - df_long = df_exon_mono_long, - title = paste("Mono-exonic", entity_label_plural, "by Structural Category Across Cells"), - x_labels = x_labels_pretty, - fill_map = fill_map_cat, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - override_outline_vars = c("Genic") - ) - - # 3) Exon count bins per structural category across cells (HTML) - exon_bin_levels <- c("1", "2-3", "4-5", ">=6") - bin_fill_map <- setNames(c("#6BAED6", "#78C679", "#FC8D59", "#969696"), exon_bin_levels) - gg_exon_bins_by_category <<- list() - for (ck in all_cats) { - cat_df <- cls_valid %>% filter(cat_key == ck) - if (nrow(cat_df) == 0) { - next - } - bins_by_cell <- cat_df %>% - mutate( - exons_n = as.numeric(exons), - bin = dplyr::case_when( - exons_n <= 1 ~ "1", - exons_n <= 3 ~ "2-3", - exons_n <= 5 ~ "4-5", - TRUE ~ ">=6" - ) - ) %>% - group_by(CB, bin) %>% - summarise(n = dplyr::n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(perc = 100 * n / sum(n)) %>% - ungroup() %>% - tidyr::complete(CB, bin = exon_bin_levels, fill = list(n = 0, perc = 0)) - - df_long_bins <- data.frame( - Variable = factor(bins_by_cell$bin, levels = exon_bin_levels), - Value = bins_by_cell$perc - ) - - pretty_name <- switch(ck, - FSM = "FSM", - ISM = "ISM", - NIC = "NIC", - NNC = "NNC", - Genic = "Genic Genomic", - Antisense = "Antisense", - Fusion = "Fusion", - Intergenic = "Intergenic", - Genic_intron = "Genic Intron", - ck - ) - - gg_exon_bins_by_category[[pretty_name]] <<- build_violin_plot( - df_long = df_long_bins, - title = paste0("Exon Count Bins in ", pretty_name, " Across Cells"), - x_labels = exon_bin_levels, - fill_map = bin_fill_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20" - ) - } - - # 4) Exon count profile per category across cells (median + IQR) - K <- 20 - min_reads <- 5 - gg_exon_profile_by_category <<- list() - for (ck in all_cats) { - cat_df <- cls_valid %>% - mutate(cat_key = unname(cat_key_map[structural_category])) %>% - dplyr::filter(cat_key == ck) - if (nrow(cat_df) == 0) next - # Cells with at least min_reads in this category; if none, fallback to all cells with any reads - cells_ok <- cat_df %>% - group_by(CB) %>% - summarise(total = dplyr::n(), .groups = "drop") %>% - filter(total >= min_reads) %>% - pull(CB) - if (length(cells_ok) == 0) { - cells_ok <- unique(cat_df$CB) - } - cat_df2 <- cat_df %>% - filter(CB %in% cells_ok) %>% - mutate(exons_n = pmin(as.numeric(exons), K)) - # Per-cell PMF - pmf <- cat_df2 %>% - group_by(CB, exons_n) %>% - summarise(n = dplyr::n(), .groups = "drop") %>% - group_by(CB) %>% - mutate(perc = 100 * n / sum(n)) %>% - ungroup() %>% - tidyr::complete(CB, exons_n = seq_len(K), fill = list(n = 0, perc = 0)) - # Aggregate across cells - prof <- pmf %>% - group_by(exons_n) %>% - summarise( - mean = base::mean(perc, na.rm = TRUE), - median = stats::median(perc, na.rm = TRUE), - q1 = stats::quantile(perc, 0.25, na.rm = TRUE, type = 7), - q3 = stats::quantile(perc, 0.75, na.rm = TRUE, type = 7), - .groups = "drop" - ) %>% - rename(k = exons_n) - pretty_name <- switch(ck, - FSM = "FSM", - ISM = "ISM", - NIC = "NIC", - NNC = "NNC", - Genic = "Genic Genomic", - Antisense = "Antisense", - Fusion = "Fusion", - Intergenic = "Intergenic", - Genic_intron = "Genic Intron", - ck - ) - gg_exon_profile_by_category[[pretty_name]] <<- build_exon_profile_plot( - df_prof = prof, title = paste0("Exon Count Profile in ", pretty_name, " Across Cells"), - line_color = fill_map_cat[ck], k_max = K, y_label = paste(entity_label_plural, ", %", sep = ""), n_cells = length(unique(cells_ok)) - ) - } - } - - ### Presets ### - ############### - - # t1 <- ttheme_default(core=list(core = list(fg_params = list(cex = 0.6)), - # colhead = list(fg_params = list(cex = 0.7)))) - - # Build SJ per-type/per-category plots and the all-canonical grouped plot for HTML (and reuse for PDF) - # This block creates plot objects regardless of generate_pdf so the Rmd can render them. - { - # Junction type percentages by structural category across cells - junc_aug_html <- Junctions %>% - dplyr::filter(CB != "unassigned") %>% - mutate(junction_type = paste(junction_category, canonical, sep = "_")) - if (!("structural_category" %in% colnames(junc_aug_html))) { - join_key <- NULL - for (k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { - if (k %in% colnames(Junctions) && k %in% colnames(Classification_file)) { - join_key <- k - break - } - } - if (!is.null(join_key)) { - by_vec <- c(CB = "CB") - by_vec[[join_key]] <- join_key - junc_aug_html <- junc_aug_html %>% - left_join(Classification_file %>% select(all_of(c(join_key, "CB", "structural_category"))), by = by_vec) - } else { - junc_aug_html$structural_category <- NA_character_ - } - } - - cat_key_map <- structural_category_map - all_cats <- structural_category_levels - x_labels_full <- cat_labels_pretty - - junc_summ_html <- junc_aug_html %>% - filter(!is.na(structural_category)) %>% - mutate(cat_key = unname(cat_key_map[structural_category])) %>% - filter(!is.na(cat_key)) %>% - group_by(CB, cat_key) %>% - summarise( - total = n(), - known_canonical = sum(junction_type == "known_canonical", na.rm = TRUE), - known_non_canonical = sum(junction_type == "known_non_canonical", na.rm = TRUE), - novel_canonical = sum(junction_type == "novel_canonical", na.rm = TRUE), - novel_non_canonical = sum(junction_type == "novel_non_canonical", na.rm = TRUE), - .groups = "drop" - ) %>% - tidyr::complete(CB, cat_key = all_cats, fill = list(total = 0, known_canonical = 0, known_non_canonical = 0, novel_canonical = 0, novel_non_canonical = 0)) %>% - mutate( - KnownCanonicalPerc = ifelse(total > 0, 100 * known_canonical / total, 0), - KnownNonCanonicalPerc = ifelse(total > 0, 100 * known_non_canonical / total, 0), - NovelCanonicalPerc = ifelse(total > 0, 100 * novel_canonical / total, 0), - NovelNonCanonicalPerc = ifelse(total > 0, 100 * novel_non_canonical / total, 0) - ) %>% - ungroup() - - make_df_long_html <- function(col_name) { - data.frame(Variable = factor(junc_summ_html$cat_key, levels = all_cats), Value = junc_summ_html[[col_name]]) - } - - fill_map_cat <- cat_fill_map - - # Create plotly versions for HTML - gg_known_canon_by_category <<- build_violin_plot( - df_long = make_df_long_html("KnownCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Known Canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - - gg_known_noncanon_by_category <<- build_violin_plot( - df_long = make_df_long_html("KnownNonCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Known Non-canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - - gg_novel_canon_by_category <<- build_violin_plot( - df_long = make_df_long_html("NovelCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Novel Canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - - gg_novel_noncanon_by_category <<- build_violin_plot( - df_long = make_df_long_html("NovelNonCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Novel Non-canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - - # Stack the four SJ type-by-category plots into one figure - tick_angle_plotly <- 45 - - - - # Create ggplot versions for static stacking - p_known_canon_by_category <- build_violin_plot( - df_long = make_df_long_html("KnownCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Known Canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100), - format = "ggplot" - ) - - p_known_noncanon_by_category <- build_violin_plot( - df_long = make_df_long_html("KnownNonCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Known Non-canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100), - format = "ggplot" - ) - - p_novel_canon_by_category <- build_violin_plot( - df_long = make_df_long_html("NovelCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Novel Canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100), - format = "ggplot" - ) - - p_novel_noncanon_by_category <- build_violin_plot( - df_long = make_df_long_html("NovelNonCanonicalPerc"), - title = "", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Novel Non-canonical Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100), - format = "ggplot" - ) - - # Stack the four SJ type-by-category plots into one static figure using gridExtra - # Remove x-axis labels/titles for top 3 plots to mimic shared axis - p1 <- p_known_canon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) - p2 <- p_known_noncanon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) - p3 <- p_novel_canon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) - p4 <- p_novel_noncanon_by_category # Keep x-axis for bottom plot - - # Create the static stack - gg_sj_type_by_category_stack <<- gridExtra::arrangeGrob( - p1, p2, p3, p4, - ncol = 1, - top = textGrob("Splice Junctions Distribution by Structural Category Across Cells", gp = gpar(fontsize = 18, fontface = "bold")) - ) - } - - - # NEW: RT-switching by splice junction type across cells (all and unique junctions) - if ("RTS_junction" %in% colnames(Junctions)) { - # Normalize boolean - rts_bool <- tolower(as.character(Junctions$RTS_junction)) %in% c("true", "t", "1", "yes") - junc_rt <- Junctions %>% - dplyr::filter(CB != "unassigned") %>% - mutate( - SJ_type = paste(junction_category, canonical, sep = "_"), - RTS_bool = rts_bool - ) - - # Ensure consistent SJ type levels and labels - sj_levels <- c("known_canonical", "known_non_canonical", "novel_canonical", "novel_non_canonical") - sj_labels <- c("Known\nCanonical", "Known\nNon-canonical", "Novel\nCanonical", "Novel\nNon-canonical") - junc_rt$SJ_type <- factor(junc_rt$SJ_type, levels = sj_levels) - - # Color map consistent with other SJ type plots - sj_fill_map <- c( - known_canonical = "#6BAED6", - known_non_canonical = "goldenrod1", - novel_canonical = "#78C679", - novel_non_canonical = "#FC8D59" - ) - - # Per-cell percentages for ALL junctions - all_junc_by_cell <- junc_rt %>% - group_by(CB, SJ_type) %>% - summarise(total = dplyr::n(), rts = sum(RTS_bool, na.rm = TRUE), .groups = "drop") %>% - tidyr::complete(CB, SJ_type = sj_levels, fill = list(total = 0, rts = 0)) %>% - mutate(perc = ifelse(total > 0, 100 * rts / total, 0)) - - df_long_all <- data.frame( - Variable = factor(all_junc_by_cell$SJ_type, levels = sj_levels), - Value = all_junc_by_cell$perc - ) - - gg_rts_all_by_sjtype <<- build_violin_plot( - df_long = df_long_all, - title = "RT-switching All Junctions by Splice Junction Type Across Cells", - x_labels = sj_labels, - fill_map = sj_fill_map, - y_label = "Junctions, %", - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20" - ) - - # Build a robust unique junction label if possible - if (!("junctionLabel" %in% colnames(junc_rt))) { - if (all(c("chrom", "strand", "genomic_start_coord", "genomic_end_coord") %in% colnames(junc_rt))) { - junc_rt$junctionLabel <- with(junc_rt, paste(chrom, strand, genomic_start_coord, genomic_end_coord, sep = "_")) - } else if (all(c("chrom", "strand", "genomic_start", "genomic_end") %in% colnames(junc_rt))) { - junc_rt$junctionLabel <- with(junc_rt, paste(chrom, strand, genomic_start, genomic_end, sep = "_")) - } else if ("junction_id" %in% colnames(junc_rt)) { - junc_rt$junctionLabel <- junc_rt$junction_id - } else { - # Fallback to row index within CB as unique proxy - junc_rt$junctionLabel <- paste0("jl_", seq_len(nrow(junc_rt))) - } - } - - # Per-cell percentages for UNIQUE junctions (deduplicate by genomic coordinates per cell & SJ type) - uniq_junc_by_cell <- junc_rt %>% - group_by(CB, SJ_type, junctionLabel) %>% - summarise(rts_any = any(RTS_bool, na.rm = TRUE), .groups = "drop") %>% - group_by(CB, SJ_type) %>% - summarise(total = dplyr::n(), rts = sum(rts_any), .groups = "drop") %>% - tidyr::complete(CB, SJ_type = sj_levels, fill = list(total = 0, rts = 0)) %>% - mutate(perc = ifelse(total > 0, 100 * rts / total, 0)) - - df_long_uniq <- data.frame( - Variable = factor(uniq_junc_by_cell$SJ_type, levels = sj_levels), - Value = uniq_junc_by_cell$perc - ) - - gg_rts_unique_by_sjtype <<- build_violin_plot( - df_long = df_long_uniq, - title = "RT-switching Unique Junctions by Splice Junction Type Across Cells", - x_labels = sj_labels, - fill_map = sj_fill_map, - y_label = "Junctions, %", - legend = FALSE, - ylim = c(0, 100), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20" - ) - } else { - message("RTS_junction column not found in Junctions. Skipping RT-switching by SJ type plots.") - } - - # Create grouped violins for % reads with all canonical junctions by structural category (HTML) - if (!exists("gg_allcanon_by_category")) { - cls2 <- Classification_file %>% dplyr::filter(CB != "unassigned") - status_map <- function(x) { - xch <- tolower(as.character(x)) - ifelse(xch %in% c("true", "canonical", "yes"), "True", - ifelse(xch %in% c("false", "non_canonical", "no"), "False", - ifelse(is.logical(x) && x, "True", - ifelse(is.logical(x) && !x, "False", NA_character_) - ) - ) - ) - } - cls2 <- cls2 %>% mutate(allcanon_status = status_map(all_canonical)) - - cat_key_map <- structural_category_map - all_cats <- structural_category_levels - bin_pretty_map <- c(FSM = "FSM", ISM = "ISM", NIC = "NIC", NNC = "NNC", Genic = "Genic Genomic", Antisense = "Antisense", Fusion = "Fusion", Intergenic = "Intergenic", Genic_intron = "Genic Intron") - pretty_levels <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") - - df_allcanon <- cls2 %>% - mutate(cat_key = unname(cat_key_map[structural_category])) %>% - filter(!is.na(cat_key), !is.na(allcanon_status)) %>% - group_by(CB, cat_key, allcanon_status) %>% - summarise(n = dplyr::n(), .groups = "drop") %>% - group_by(CB, cat_key) %>% - mutate(perc = 100 * n / sum(n)) %>% - ungroup() %>% - tidyr::complete(CB, cat_key = all_cats, allcanon_status = c("True", "False"), fill = list(n = 0, perc = 0)) - - if (nrow(df_allcanon) > 0) { - df_allcanon$allcanon_status <- factor(df_allcanon$allcanon_status, levels = c("True", "False")) - cols_tf <- c("True" = "#6baed6", "False" = "#ffc125") - gg_allcanon_by_category <<- build_grouped_violin_plot( - df = df_allcanon %>% transmute(bin = unname(bin_pretty_map[as.character(cat_key)]), group = as.character(allcanon_status), value = perc), - bin_levels = pretty_levels, - group_levels = c("True", "False"), - title = paste(entity_label_plural, "with All Canonical Junctions Distribution by Structural Category Across Cells"), - fill_map = cols_tf, - legend_labels = c("True" = "True", "False" = "False"), - y_label = paste(entity_label_plural, ", %", sep = ""), - ylim = c(0, 100), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.08, - x_tickangle = 45, - violin_width = 0.45, - dodge_width = 0.8, - violangap = 0.05, - violingroupgap = 0.15, - legend_title = "all_canonical" - ) - } - } - - # Coding / Non-Coding / NMD Plots - if (exists("SQANTI_cell_summary")) { - # Check if Coding columns exist - # Coding: ends with "_coding_prop" but NOT "_non_coding_prop" - all_coding_like <- grep("_coding_prop$", colnames(SQANTI_cell_summary), value = TRUE) - non_coding_cols <- grep("_non_coding_prop$", colnames(SQANTI_cell_summary), value = TRUE) - coding_cols <- setdiff(all_coding_like, non_coding_cols) - - - - # NMD - if ("NMD_prop_in_cell" %in% colnames(SQANTI_cell_summary)) { - nmd_cat_cols <- grep(".*_NMD_prop$", colnames(SQANTI_cell_summary), value = TRUE) - if (length(nmd_cat_cols) > 0) { - df_nmd <- SQANTI_cell_summary %>% - select(CB, all_of(nmd_cat_cols)) %>% - pivot_longer(cols = all_of(nmd_cat_cols), names_to = "Variable", values_to = "Value") %>% - mutate( - Variable = gsub("_NMD_prop$", "", Variable) - ) - - # Helper to map tag to pretty label - tag_to_pretty <- function(tag) { - case_map <- c( - "FSM" = "FSM", "ISM" = "ISM", "NIC" = "NIC", "NNC" = "NNC", - "genic" = "Genic Genomic", "antisense" = "Antisense", "fusion" = "Fusion", - "intergenic" = "Intergenic", "genic_intron" = "Genic Intron" - ) - if (tag %in% names(case_map)) { - return(case_map[[tag]]) - } - return(tag) - } - - df_nmd$PrettyVar <- sapply(df_nmd$Variable, tag_to_pretty) - - # Filter to all known categories (including non-canonical) - all_nmd_vars <- c("FSM", "ISM", "NIC", "NNC", "genic", "antisense", "fusion", "intergenic", "genic_intron") - df_nmd <- df_nmd %>% filter(Variable %in% all_nmd_vars) - - # Factor levels - nmd_levels <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") - df_nmd$PrettyVar <- factor(df_nmd$PrettyVar, levels = nmd_levels) - - # Use grey color for all NMD plots - nmd_fill_map <- setNames(rep("#969696", length(nmd_levels)), nmd_levels) - - gg_nmd_by_category <<- build_violin_plot( - df_long = data.frame(Variable = df_nmd$PrettyVar, Value = df_nmd$Value), - title = paste("Predicted NMD", entity_label_plural, "Distribution by Structural Category Across Cells"), - x_labels = levels(df_nmd$PrettyVar), - fill_map = nmd_fill_map, - y_label = paste(entity_label_plural, ", %", sep = ""), - legend = FALSE, - override_outline_vars = character(0), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - } - } - } - - ### Generate PDF report ### - ########################### - - if (generate_pdf) { - pdf(file.path(paste0(report_output, ".pdf")), paper = "a4r", width = 14, height = 11, useDingbats = FALSE) - # Add cover page - grid.newpage() - title_text <- if (mode == "isoforms") "SQANTI-single cell\nisoforms report" else "SQANTI-single cell\nreads report" - cover <- textGrob(title_text, - gp = gpar(fontface = "italic", fontsize = 40, col = "orangered") - ) - grid.draw(cover) - # Bulk tables - s <- textGrob("Bulk summary", gp = gpar(fontface = "italic", fontsize = 30), vjust = 0) - grid.arrange(s) - - # Calculate bulk-level stats - if (mode == "isoforms") { - total_reads_count <- sum(Classification_file$count, na.rm = TRUE) - } else { - total_reads_count <- nrow(Classification_file) - } - unique_genes <- length(unique(Classification_file$associated_gene)) - if (mode == "isoforms") { - unique_junctions <- 0 - } else { - unique_junctions <- length(unique(Classification_file$jxn_string)) - } - - # Gene Classification table - gene_class_table <- data.frame( - Category = c("Annotated Genes", "Novel Genes"), - "Genes, count" = c( - length(unique(Classification_file$associated_gene[!grepl("^novel", Classification_file$associated_gene)])), - length(unique(Classification_file$associated_gene[grepl("^novel", Classification_file$associated_gene)])) - ), - check.names = FALSE - ) - - # Read Classification table (counts per structural category) - read_cat_levels <- c( - "full-splice_match", "incomplete-splice_match", "novel_in_catalog", "novel_not_in_catalog", - "genic", "antisense", "fusion", "intergenic", "genic_intron" - ) - read_cat_names <- c( - "FSM", "ISM", "NIC", "NNC", - "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" - ) - - if (mode == "isoforms") { - read_class_table <- aggregate(Classification_file$count, by = list(Category = factor(Classification_file$structural_category, levels = read_cat_levels)), FUN = sum, na.rm = TRUE) - colnames(read_class_table) <- c("Category", paste0(entity_label_plural, ", count")) - # Ensure all levels are present (aggregate might drop empty ones if not careful, but factor levels help) - # Actually aggregate returns only present levels. Let's use complete. - read_class_table <- data.frame(Category = read_cat_levels) %>% - left_join(read_class_table, by = "Category") - read_class_table[is.na(read_class_table)] <- 0 - } else { - read_class_table <- as.data.frame(table(factor(Classification_file$structural_category, levels = read_cat_levels))) - colnames(read_class_table) <- c("Category", paste0(entity_label_plural, ", count")) - } - read_class_table$Category <- read_cat_names - - # Splice Junction Classification table - Junctions$junction_type <- paste(Junctions$junction_category, Junctions$canonical, sep = "_") - - sj_types <- c("known_canonical", "known_non_canonical", "novel_canonical", "novel_non_canonical") - if (mode == "isoforms") { - sj_counts <- sapply(sj_types, function(type) sum(Junctions$count[Junctions$junction_type == type], na.rm = TRUE)) - } else { - sj_counts <- sapply(sj_types, function(type) sum(Junctions$junction_type == type, na.rm = TRUE)) - } - - # Handle case where there are no junctions - total_junctions <- sum(sj_counts, na.rm = TRUE) - sj_perc <- if (total_junctions > 0) { - round(100 * sj_counts / total_junctions, 2) - } else { - rep(0, length(sj_counts)) - } - - SJ_class_table <- data.frame( - Category = c("Known canonical", "Known Non-canonical", "Novel canonical", "Novel Non-canonical"), - `SJs, count` = sj_counts, - Percent = sj_perc, - check.names = FALSE - ) - rownames(SJ_class_table) <- NULL - - big_table_theme <- ttheme_default( - core = list(fg_params = list(cex = 1.5)), - colhead = list(fg_params = list(cex = 1.5, fontface = "bold")) - ) - - title_genes <- textGrob("Gene Classification", gp = gpar(fontface = "italic", fontsize = 24), vjust = -3) - title_reads <- textGrob(paste(entity_label, "Classification"), gp = gpar(fontface = "italic", fontsize = 24), vjust = -7.7) - title_sj <- textGrob("Splice Junction Classification", gp = gpar(fontface = "italic", fontsize = 24), vjust = -4.3) - - table_genes <- tableGrob(gene_class_table, rows = NULL, theme = big_table_theme) - table_reads <- tableGrob(read_class_table, rows = NULL, theme = big_table_theme) - table_sj <- tableGrob(SJ_class_table, rows = NULL, theme = big_table_theme) - - if (unique_junctions > 0) { - unique_counts_text <- sprintf( - "Number of %s: %d\nUnique Genes: %d\nUnique Junction Chains: %d", - entity_label_plural, total_reads_count, unique_genes, unique_junctions - ) - } else { - unique_counts_text <- sprintf( - "Number of %s: %d\nUnique Genes: %d", - entity_label_plural, total_reads_count, unique_genes - ) - } - unique_counts_grob <- textGrob( - unique_counts_text, - gp = gpar(fontface = "italic", fontsize = 28), vjust = 0, hjust = 0.5 - ) - - # Create gTree objects to overlay titles and tables - gt_genes <- gTree(children = gList(table_genes, title_genes)) - gt_reads <- gTree(children = gList(table_reads, title_reads)) - gt_sj <- gTree(children = gList(table_sj, title_sj)) - - # Arrange left column: Gene Classification + Splice Junction Classification - left_col <- arrangeGrob( - gt_genes, - gt_sj, - ncol = 1, - heights = c(0.2, 0.4) - ) - - # Arrange right column: Read Classification - right_col <- arrangeGrob( - gt_reads, - ncol = 1 - ) - - # Final page layout - grid.arrange( - unique_counts_grob, - arrangeGrob(left_col, right_col, ncol = 2, widths = c(1.3, 1.3)), - nrow = 2, - heights = c(0.8, 1) - ) - - # Single cell tables - s <- textGrob("Cell summary", gp = gpar(fontface = "italic", fontsize = 30), vjust = 0) - grid.arrange(s) - - # Number of cells - num_cells <- nrow(SQANTI_cell_summary) - num_cells_grob <- textGrob( - sprintf("Unique Cell Barcodes: %d", num_cells), - gp = gpar(fontface = "italic", fontsize = 28), vjust = 0.5, hjust = 0.5 - ) - - # 1. Unique Genes and Unique Junction Chains summary table - unique_genes_stats <- c( - Mean = mean(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), - Median = median(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), - Min = min(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), - Max = max(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), - SD = sd(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE) - ) - unique_junctions_stats <- c( - Mean = mean(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), - Median = median(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), - Min = min(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), - Max = max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), - SD = sd(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) - ) - reads_stats <- c( - Mean = mean(SQANTI_cell_summary[[count_col]], na.rm = TRUE), - Median = median(SQANTI_cell_summary[[count_col]], na.rm = TRUE), - Min = min(SQANTI_cell_summary[[count_col]], na.rm = TRUE), - Max = max(SQANTI_cell_summary[[count_col]], na.rm = TRUE), - SD = sd(SQANTI_cell_summary[[count_col]], na.rm = TRUE) - ) - umis_stats <- c( - Mean = mean(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), - Median = median(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), - Min = min(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), - Max = max(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), - SD = sd(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE) - ) - summary_table1 <- data.frame( - Feature = c(paste(entity_label_plural, "in cell"), "UMIs in cell", "Unique Genes", "Unique Junction Chains"), - Mean = c(reads_stats["Mean"], umis_stats["Mean"], unique_genes_stats["Mean"], unique_junctions_stats["Mean"]), - Median = c(reads_stats["Median"], umis_stats["Median"], unique_genes_stats["Median"], unique_junctions_stats["Median"]), - Min = c(reads_stats["Min"], umis_stats["Min"], unique_genes_stats["Min"], unique_junctions_stats["Min"]), - Max = c(reads_stats["Max"], umis_stats["Max"], unique_genes_stats["Max"], unique_junctions_stats["Max"]), - SD = c(reads_stats["SD"], umis_stats["SD"], unique_genes_stats["SD"], unique_junctions_stats["SD"]) - ) - # If isoforms mode, drop Unique Junction Chains from summary table - if (mode == "isoforms" || !("UJCs_in_cell" %in% names(SQANTI_cell_summary)) || all(is.na(SQANTI_cell_summary$UJCs_in_cell)) || max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) == 0) { - summary_table1 <- summary_table1[!(summary_table1$Feature %in% c("Unique Junction Chains", "UMIs in cell")), , drop = FALSE] - } - summary_table1[, 2:6] <- round(summary_table1[, 2:6], 3) - table_summary1 <- tableGrob(summary_table1, rows = NULL, theme = big_table_theme) - gt_summary1 <- gTree(children = gList(table_summary1)) - - # 2. Gene Classification summary table (across all cells) - gene_class_stats <- data.frame( - Category = c("Annotated Genes", "Novel Genes"), - Mean = c(mean(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), mean(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), - Median = c(median(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), median(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), - Min = c(min(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), min(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), - Max = c(max(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), max(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), - SD = c(sd(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), sd(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)) - ) - gene_class_stats[, 2:6] <- round(gene_class_stats[, 2:6], 3) - table_gene_class_stats <- tableGrob(gene_class_stats, rows = NULL, theme = big_table_theme) - title_gene_class_stats <- textGrob("Gene Classification (per cell)", gp = gpar(fontface = "italic", fontsize = 22), vjust = -2.9) - gt_gene_class_stats <- gTree(children = gList(table_gene_class_stats, title_gene_class_stats)) - - # 3. Splice Junction Classification summary table (across all cells) - - # Create a junction type column for easier summarization - Junctions$junction_type <- paste(Junctions$junction_category, Junctions$canonical, sep = "_") - - # Calculate proportions of each junction type per cell - junction_proportions_per_cell <- Junctions %>% - filter(CB != "unassigned") %>% - group_by(CB) %>% - summarise( - Known_canonical = sum(junction_type == "known_canonical", na.rm = TRUE) / n() * 100, - Known_Non_canonical = sum(junction_type == "known_non_canonical", na.rm = TRUE) / n() * 100, - Novel_canonical = sum(junction_type == "novel_canonical", na.rm = TRUE) / n() * 100, - Novel_Non_canonical = sum(junction_type == "novel_non_canonical", na.rm = TRUE) / n() * 100, - .groups = "drop" - ) - - # Calculate summary statistics across all cells - sj_stats <- junction_proportions_per_cell %>% - select(-CB) %>% - summarise( - across( - everything(), - list( - Mean = ~ mean(.x, na.rm = TRUE), - Median = ~ median(.x, na.rm = TRUE), - Min = ~ min(.x, na.rm = TRUE), - Max = ~ max(.x, na.rm = TRUE), - SD = ~ sd(.x, na.rm = TRUE) - ) - ) - ) - - # Reshape the data for display - sj_stats_df <- sj_stats %>% - pivot_longer( - cols = everything(), - names_to = c("Category", ".value"), - names_pattern = "(.+)_(Mean|Median|Min|Max|SD)$" - ) %>% - mutate(Category = gsub("_", " ", Category)) - - # Ensure we have the expected number of columns before subsetting - if (ncol(sj_stats_df) >= 6) { - sj_stats_df[, 2:6] <- round(sj_stats_df[, 2:6], 3) - } else { - # If we have fewer columns, round all numeric columns except the first (Category) - numeric_cols <- sapply(sj_stats_df[, -1], is.numeric) - sj_stats_df[, -1][numeric_cols] <- round(sj_stats_df[, -1][numeric_cols], 3) - } - table_sj_stats <- tableGrob(sj_stats_df, rows = NULL, theme = big_table_theme) - title_sj_stats <- textGrob("Splice Junction Classification (per cell, %)", gp = gpar(fontface = "italic", fontsize = 22), vjust = -4.4) - gt_sj_stats <- gTree(children = gList(table_sj_stats, title_sj_stats)) - - grid.arrange( - num_cells_grob, - gt_summary1, - gt_gene_class_stats, - gt_sj_stats, - ncol = 1, - heights = c(0.3, 1, 0.7, 0.9) - ) - - # Cell Summary Statistics Page 2: Read Classification - title_read_class <- textGrob(paste(entity_label, "Classification"), gp = gpar(fontface = "italic", fontsize = 28), vjust = 0, hjust = 0.5) - desc_counts <- textGrob(paste("Summary of per cell", entity_label_lower, "counts by structural category"), gp = gpar(fontface = "italic", fontsize = 18), vjust = 0.5) - desc_props <- textGrob(paste("Summary of per cell", entity_label_lower, "percentages by structural category"), gp = gpar(fontface = "italic", fontsize = 18), vjust = 0.5) - struct_cat_cols <- c( - "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" - ) - struct_cat_names <- c( - "FSM", "ISM", "NIC", "NNC", - "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" - ) - - # Smaller table theme for these two tables - small_table_theme <- ttheme_default( - core = list(fg_params = list(cex = 1.2)), - colhead = list(fg_params = list(cex = 1.2, fontface = "bold")) - ) - - # 1. Counts summary table - count_stats <- sapply(struct_cat_cols, function(col) { - vals <- SQANTI_cell_summary[[col]] - c( - Mean = mean(vals, na.rm = TRUE), - Median = median(vals, na.rm = TRUE), - Min = min(vals, na.rm = TRUE), - Max = max(vals, na.rm = TRUE), - SD = sd(vals, na.rm = TRUE) - ) - }) - count_stats_df <- data.frame( - Category = struct_cat_names, - t(count_stats) - ) - colnames(count_stats_df)[2:6] <- c("Mean", "Median", "Min", "Max", "SD") - count_stats_df[, 2:6] <- round(count_stats_df[, 2:6], 3) - table_count_stats <- tableGrob(count_stats_df, rows = NULL, theme = small_table_theme) - - # 2. Proportions summary table - prop_cat_cols <- paste0(struct_cat_cols, "_prop") - prop_stats <- sapply(prop_cat_cols, function(col) { - vals <- SQANTI_cell_summary[[col]] - c( - Mean = mean(vals, na.rm = TRUE), - Median = median(vals, na.rm = TRUE), - Min = min(vals, na.rm = TRUE), - Max = max(vals, na.rm = TRUE), - SD = sd(vals, na.rm = TRUE) - ) - }) - prop_stats_df <- data.frame( - Category = struct_cat_names, - t(prop_stats) - ) - colnames(prop_stats_df)[2:6] <- c("Mean", "Median", "Min", "Max", "SD") - prop_stats_df[, 2:6] <- round(prop_stats_df[, 2:6], 3) - table_prop_stats <- tableGrob(prop_stats_df, rows = NULL, theme = small_table_theme) - - grid.arrange( - title_read_class, - desc_counts, - table_count_stats, - desc_props, - table_prop_stats, - ncol = 1, - heights = c(0.3, 0.12, 1, 0.12, 1) - ) - - # Helper for section title pages - section_page <- function(title) { - grid.newpage() - grid.draw(textGrob(title, gp = gpar(fontface = "italic", fontsize = 30, col = "black"))) - } - - # Per-cell Library Size section - section_page("Per-cell Library Size") - render_pdf_plot_centered("gg_reads_in_cells", width_frac = 0.5) - render_pdf_plot_centered("gg_umis_in_cells", width_frac = 0.5) - if (exists("gg_JCs_in_cell")) render_pdf_plot_centered("gg_JCs_in_cell", width_frac = 0.5) - - # Gene Characterization section - section_page("Gene Characterization") - # Genes Across Cells - render_pdf_plot_centered("gg_genes_in_cells", width_frac = 0.5) - render_pdf_plot("gg_annotation_of_genes_in_cell") - render_pdf_plot("gg_annotation_of_genes_percent_in_cell") - # Reads per Gene - render_pdf_plot("gg_annotation_of_reads_in_cell") - render_pdf_plot("gg_read_bins_all") - render_pdf_plot("gg_read_bins") - if (mode == "isoforms" && exists("gg_isoform_bins")) { - render_pdf_plot("gg_isoform_bins") - } - # UJCs per Gene - if (mode != "isoforms" && exists("gg_ujc_bins_all")) { - render_pdf_plot("gg_ujc_bins_all") - render_pdf_plot("gg_ujc_bins") - } - # Mitochondrial genes - render_pdf_plot("gg_MT_perc") - - # Read Length Characterization section - section_page(paste(entity_label, "Length Characterization")) - # Bulk Length Distribution - render_pdf_plot("gg_bulk_all_reads", converter = NULL) - render_pdf_plot("gg_bulk_length_by_category", converter = NULL) - render_pdf_plot("gg_bulk_length_by_exon_type", converter = NULL) - # Overall cell-level distributions: All then Mono on next page - render_pdf_plot("gg_read_distr") - render_pdf_plot("gg_read_distr_mono") - # Category-specific: print All then Mono right after - for (tag in c("FSM", "ISM", "NIC", "NNC", "genic", "antisense", "fusion", "intergenic", "genic_intron")) { - all_nm <- paste0("gg_", tag, "_read_distr") - mono_nm <- paste0("gg_", tag, "_mono_read_distr") - if (exists(all_nm)) render_pdf_plot(all_nm) - if (exists(mono_nm)) render_pdf_plot(mono_nm) - } - # Reference Transcript Coverage - render_pdf_plot("gg_ref_coverage_across_category") - - # Structural Read Characterization section - section_page(paste("Structural", entity_label, "Characterization")) - # Distribution by Structural Categories - render_pdf_plot("gg_SQANTI_across_category") - render_pdf_plot("gg_exon_mono_by_category") - for (nm in c( - "gg_SQANTI_across_FSM", "gg_SQANTI_across_ISM", "gg_SQANTI_across_NIC", "gg_SQANTI_across_NNC", - "gg_SQANTI_across_Genic", "gg_SQANTI_across_Antisense", "gg_SQANTI_across_Fusion", "gg_SQANTI_across_Intergenic", - "gg_SQANTI_across_Genic_Intron" - )) { - render_pdf_plot(nm) - } - # Exon Counts - render_pdf_plot("gg_exon_mean_by_category") - if (exists("gg_exon_profile_by_category")) { - prof_order <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") - for (nm in prof_order) { - if (!is.null(gg_exon_profile_by_category[[nm]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[[nm]])) - } - } - } - - # Coding and Non-Coding Distributions - if (!skipORF) { - if (exists("gg_coding_across_category")) render_pdf_plot("gg_coding_across_category") - if (exists("gg_non_coding_across_category")) render_pdf_plot("gg_non_coding_across_category") - } - - # Splice Junction Characterization section - # Compute per-structural-category SJ distributions for PDF pages - junc_aug <- Junctions %>% - dplyr::filter(CB != "unassigned") %>% - mutate(junction_type = paste(junction_category, canonical, sep = "_")) - - # Add count column for weighted quantification - if (mode == "isoforms" && "FL" %in% colnames(Classification_file)) { - Classification_file$count <- Classification_file$FL - } else { - Classification_file$count <- 1 - } - - if (!("structural_category" %in% colnames(junc_aug))) { - # Try to bring structural_category from Classification_file by common key + CB - join_key <- NULL - for (k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { - if (k %in% colnames(Junctions) && k %in% colnames(Classification_file)) { - join_key <- k - break - } - } - if (!is.null(join_key)) { - by_vec <- c(CB = "CB") - by_vec[[join_key]] <- join_key - junc_aug <- junc_aug %>% - left_join(Classification_file %>% select(all_of(c(join_key, "CB", "structural_category"))), by = by_vec) - } else { - junc_aug$structural_category <- NA_character_ - junc_aug$count <- 1 - } - } - - cat_key_map <- structural_category_map - all_cats <- structural_category_levels - - junc_summ <- junc_aug %>% - filter(!is.na(structural_category)) %>% - mutate(cat_key = unname(cat_key_map[structural_category])) %>% - filter(!is.na(cat_key)) %>% - group_by(CB, cat_key) %>% - summarise( - total = sum(count, na.rm = TRUE), - known_canonical = sum(count[junction_type == "known_canonical"], na.rm = TRUE), - known_non_canonical = sum(count[junction_type == "known_non_canonical"], na.rm = TRUE), - novel_canonical = sum(count[junction_type == "novel_canonical"], na.rm = TRUE), - novel_non_canonical = sum(count[junction_type == "novel_non_canonical"], na.rm = TRUE), - .groups = "drop" - ) %>% - tidyr::complete(CB, cat_key = all_cats, fill = list(total = 0, known_canonical = 0, known_non_canonical = 0, novel_canonical = 0, novel_non_canonical = 0)) %>% - mutate( - KnownCanonicalPerc = ifelse(total > 0, 100 * known_canonical / total, 0), - KnownNonCanonicalPerc = ifelse(total > 0, 100 * known_non_canonical / total, 0), - NovelCanonicalPerc = ifelse(total > 0, 100 * novel_canonical / total, 0), - NovelNonCanonicalPerc = ifelse(total > 0, 100 * novel_non_canonical / total, 0) - ) %>% - ungroup() - - # Prepare plotting helpers - fill_map_cat <- cat_fill_map - x_labels_full <- cat_labels_pretty - make_df_long <- function(col_name) { - data.frame(Variable = factor(junc_summ$cat_key, levels = all_cats), Value = junc_summ[[col_name]]) - } - - p_known_can <- build_violin_plot_ggplot( - df_long = make_df_long("KnownCanonicalPerc"), - title = "Known Canonical Splice Junctions Distribution by Structural Category Across Cells", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - p_known_noncan <- build_violin_plot_ggplot( - df_long = make_df_long("KnownNonCanonicalPerc"), - title = "Known Non-canonical Splice Junctions Distribution by Structural Category Across Cells", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - p_novel_can <- build_violin_plot_ggplot( - df_long = make_df_long("NovelCanonicalPerc"), - title = "Novel Canonical Splice Junctions Distribution by Structural Category Across Cells", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - p_novel_noncan <- build_violin_plot_ggplot( - df_long = make_df_long("NovelNonCanonicalPerc"), - title = "Novel Non-canonical Splice Junctions Distribution by Structural Category Across Cells", - x_labels = x_labels_full, - fill_map = fill_map_cat, - y_label = "Junctions, %", - legend = FALSE, - override_outline_vars = c("Genic"), - violin_alpha = 0.7, - box_alpha = 0.3, - box_width = 0.05, - x_tickangle = 45, - violin_outline_fill = TRUE, - box_outline_default = "grey20", - ylim = c(0, 100) - ) - - section_page("Splice Junction Characterization") - render_pdf_plot("gg_known_novel_canon") - print(p_known_can) - print(p_known_noncan) - print(p_novel_can) - print(p_novel_noncan) - render_pdf_plot("gg_allcanon_by_category") - if (exists("gg_rts_all_by_sjtype")) render_pdf_plot("gg_rts_all_by_sjtype") - if (exists("gg_rts_unique_by_sjtype")) render_pdf_plot("gg_rts_unique_by_sjtype") - - # Features of Bad Quality section - section_page("Features of Bad Quality") - render_pdf_plot("gg_bad_feature") - render_pdf_plot("gg_intrapriming_by_category") - render_pdf_plot("gg_RTS_by_category") - - render_pdf_plot("gg_noncanon_by_category") - if (exists("gg_nmd_by_category")) render_pdf_plot("gg_nmd_by_category") - - # Features of Good Quality section - section_page("Features of Good Quality") - render_pdf_plot("gg_good_feature") - render_pdf_plot("gg_tss_annotation_support") - if (CAGE_peak) render_pdf_plot("gg_cage_peak_support") - if (polyA_motif_list) render_pdf_plot("gg_polyA_motif_support") - render_pdf_plot("gg_canon_by_category") - if (exists("gg_sr_support_by_category")) render_pdf_plot("gg_sr_support_by_category") - if (exists("gg_tss_validation_by_category")) render_pdf_plot("gg_tss_validation_by_category") - - # Clustering Analysis - if (exists("gg_umap") && !is.null(gg_umap)) { - section_page("Clustering analysis") - print(gg_umap) - - # Print UMAP by structural category if available (one per page) - if (exists("gg_umap_by_category") && !is.null(gg_umap_by_category)) { - for (cat_label in names(gg_umap_by_category)) { - print(gg_umap_by_category[[cat_label]]) - } - } - - # Print Short Read Support by Cluster (Violin + UMAPs) - if (exists("gg_sr_cluster_plots") && !is.null(gg_sr_cluster_plots)) { - - # UMAPs first (Global then Category-specific if available) - if (exists("gg_sr_umap_plots") && !is.null(gg_sr_umap_plots)) { - if (!is.null(gg_sr_umap_plots[["All Transcripts"]])) print(gg_sr_umap_plots[["All Transcripts"]]) - for (label in setdiff(names(gg_sr_umap_plots), "All Transcripts")) { - print(gg_sr_umap_plots[[label]]) - } - } - - # Violin plots - for (label in names(gg_sr_cluster_plots)) { - # Convert from plotly to ggplot for PDF - p_plotly <- gg_sr_cluster_plots[[label]] - p_ggplot <- plotly_to_ggplot(p_plotly) - print(p_ggplot) - } - } - - # Print TSS Validation Support by Cluster (Violin + UMAPs) - if (exists("gg_tss_cluster_plots") && !is.null(gg_tss_cluster_plots)) { - - # UMAPs first - if (exists("gg_tss_umap_plots") && !is.null(gg_tss_umap_plots)) { - if (!is.null(gg_tss_umap_plots[["All Transcripts"]])) print(gg_tss_umap_plots[["All Transcripts"]]) - for (label in setdiff(names(gg_tss_umap_plots), "All Transcripts")) { - print(gg_tss_umap_plots[[label]]) - } - } - - # Violin plots - for (label in names(gg_tss_cluster_plots)) { - # Convert from plotly to ggplot for PDF - p_plotly <- gg_tss_cluster_plots[[label]] - p_ggplot <- plotly_to_ggplot(p_plotly) - print(p_ggplot) - } - } - } - - dev.off() - } -} - -Classification <- data.table::fread(class.file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) -if (mode == "isoforms" && "FL" %in% colnames(Classification)) { - Classification$count <- sapply(strsplit(as.character(Classification$FL), ","), function(x) sum(as.numeric(x), na.rm = TRUE)) - Classification$count[is.na(Classification$count) | Classification$count == 0] <- 1 -} else { - Classification$count <- 1 -} -Junctions <- data.table::fread(junc.file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) - -# Add count column to Junctions for weighted quantification -if (mode == "isoforms") { - # Try to join by isoform ID - join_key <- NULL - for (k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { - if (k %in% colnames(Junctions) && k %in% colnames(Classification)) { - join_key <- k - break - } - } - - if (!is.null(join_key)) { - # Assign isoform count to each junction row - - - # Use match to be faster than merge/join for simple lookup - Junctions$count <- Classification$count[match(Junctions[[join_key]], Classification[[join_key]])] - # Handle NAs (shouldn't happen if consistent) - Junctions$count[is.na(Junctions$count)] <- 1 - } else { - Junctions$count <- 1 - } -} else { - Junctions$count <- 1 -} - -# Require precomputed cell summary produced by sqanti_sc.py -if (!is.null(cell_summary_path) && file.exists(cell_summary_path)) { - SQANTI_cell_summary <- data.table::fread(cell_summary_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) -} else { - stop("A precomputed cell summary is required. Pass --cell_summary from sqanti_sc.py.") -} - -# Generate reports based on format -if (report.format == "pdf" || report.format == "both") { - generate_sqantisc_plots( - SQANTI_cell_summary, - Classification, - Junctions, - report_output - ) -} - -if (report.format == "html" || report.format == "both") { - # Generate plots first so they're available for Rmd - if (report.format == "html") { - generate_sqantisc_plots( - SQANTI_cell_summary, - Classification, - Junctions, - report_output, - generate_pdf = FALSE - ) - } - # Set up HTML report generation - # Get the directory where this R script is located (utilities folder) - cmd_args <- commandArgs(trailingOnly = FALSE) - script_arg <- cmd_args[grep("--file=", cmd_args)] - if (length(script_arg) > 0) { - script_path <- gsub("--file=", "", script_arg) - script_dir <- dirname(normalizePath(script_path)) - } else { - stop("Cannot determine script location") - } - - rmd_file <- file.path(script_dir, "SQANTI-sc_report.Rmd") - css_file <- file.path(script_dir, "style.css") - - # Check if Rmd file exists - if (!file.exists(rmd_file)) { - stop( - "R Markdown file not found: ", rmd_file, - "\nPlease ensure SQANTI-sc_report.Rmd is in the same directory as this script." - ) - } - - # Copy CSS file to output directory if it exists - # Copy CSS file to output directory if it exists - if (file.exists(css_file)) { - css_output <- file.path(dirname(report_output), "style.css") - file.copy(css_file, css_output, overwrite = TRUE) - } - - # Generate HTML report - html_output_file <- paste0(report_output, ".html") - - message("Generating HTML report...") - - - - rmarkdown::render( - input = rmd_file, - output_file = html_output_file, - output_dir = dirname(report_output), - intermediates_dir = dirname(report_output), - quiet = FALSE, - envir = globalenv() - ) - - # Cleanup: remove the copied CSS file - if (exists("css_output") && file.exists(css_output)) { - file.remove(css_output) - } - - message("HTML report generated: ", html_output_file) -} +#!/usr/env/bin Rscript + +###################################################### +##### SQANTI single-cell reads report generation ##### +###################################################### + +### Author: Juan Francisco Cervilla & Carlos Blanco + +#********************** Packages + +suppressWarnings(suppressPackageStartupMessages({ + library(dplyr) + library(ggplot2) + library(tidyr) + library(forcats) + library(grid) + library(gridExtra) + library(rmarkdown) + library(scales) + library(data.table) +})) + +# Prevent Rplots.pdf generation +pdf(NULL) + +#********************** Taking arguments from python script + +args <- commandArgs(trailingOnly = TRUE) +class.file <- args[1] +junc.file <- args[2] +report.format <- args[3] +outputPathPrefix <- args[4] +mode <- args[5] + +# Initialize ignore_cell_summary flag +ignore_cell_summary <- FALSE +skipORF <- FALSE +CAGE_peak <- FALSE +polyA_motif_list <- FALSE +cell_summary_path <- NULL +ref_gtf_path <- NULL + +# Check for optional arguments +if (length(args) > 5) { + i <- 6 + while (i <= length(args)) { + arg <- args[i] + if (arg == "--ignore_cell_summary") { + ignore_cell_summary <- TRUE + i <- i + 1 + next + } + if (arg == "--skipORF") { + skipORF <- TRUE + i <- i + 1 + next + } + if (arg == "--CAGE_peak") { + CAGE_peak <- TRUE + i <- i + 1 + next + } + if (arg == "--polyA_motif_list") { + polyA_motif_list <- TRUE + i <- i + 1 + next + } + if (arg == "--cell_summary") { + if ((i + 1) <= length(args)) { + cell_summary_path <- args[i + 1] + i <- i + 2 + next + } else { + stop("--cell_summary requires a path argument") + } + } + if (arg == "--clustering") { + if ((i + 1) <= length(args)) { + clustering_path <- args[i + 1] + i <- i + 2 + next + } else { + stop("--clustering requires a path argument") + } + } + if (arg == "--refGTF") { + if ((i + 1) <= length(args)) { + ref_gtf_path <- args[i + 1] + i <- i + 2 + next + } else { + stop("--refGTF requires a path argument") + } + } + i <- i + 1 + } +} + +# Validate arguments +if (length(args) < 5) { + stop("Incorrect number of arguments! Required: [classification file] [junc file] [report format] [outputPathPrefix] [mode]. Abort!") +} + +if (!(report.format %in% c("pdf", "html", "both"))) { + stop("Report format needs to be: pdf, html, or both. Abort!") +} + +# Validate mode argument +if (!(mode %in% c("reads", "isoforms"))) { + stop("Mode needs to be: reads or isoforms. Abort!") +} + +# Set labels based on mode +if (mode == "isoforms") { + entity_label <- "Transcript" + entity_label_plural <- "Transcripts" +} else { + entity_label <- "Read" + entity_label_plural <- "Reads" +} + +# Lowercase versions for inline text +entity_label_lower <- tolower(entity_label) +entity_label_plural_lower <- tolower(entity_label_plural) + +# Print cell summary saving status +if (ignore_cell_summary) { + print("Cell summary table will not be saved (--ignore_cell_summary flag is active).") +} else { + print("Cell summary table will be saved.") +} + +# Call the function with the appropriate Save parameter +save_option <- ifelse(ignore_cell_summary, "N", "Y") + +# Define column names based on mode +if (mode == "isoforms") { + count_col <- "Transcripts_in_cell" + no_mono_col <- "total_transcripts_no_monoexon" +} else { + count_col <- "Reads_in_cell" + no_mono_col <- "total_reads_no_monoexon" +} + +# Generate output file names with full paths +cell_summary_output <- file.path(paste0(outputPathPrefix, "_SQANTI_cell_summary")) +report_output <- file.path(paste0(outputPathPrefix, "_SQANTI_sc_report_", mode)) +clustering_output <- file.path(dirname(outputPathPrefix), "clustering", "umap_results.csv") + +# Define standard colors +fill_color_orange <- "#CC6633" + +# Check for clustering results +gg_umap <- NULL +if (file.exists(clustering_output)) { + print(paste("Found clustering results at:", clustering_output)) + tryCatch( + { + umap_df <- read.csv(clustering_output) + umap_df$Cluster <- as.factor(umap_df$Cluster) + + gg_umap <- ggplot(umap_df, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) + + geom_point(alpha = 0.6, size = 0.5) + + theme_classic() + + labs(title = "UMAP Projection", x = "UMAP 1", y = "UMAP 2") + + theme( + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.title = element_text(face = "bold"), + legend.position = "right" + ) + + guides(color = guide_legend(override.aes = list(size = 3, alpha = 1))) + }, + error = function(e) { + print(paste("Error reading clustering results:", e$message)) + } + ) +} else { + print("No clustering results found.") +} + +# ---------------------------------------------------------------- +# Helper Functions (Global Scope) +# ---------------------------------------------------------------- + +# Helper: pivot selected columns to long and return factor-ordered long df +pivot_long <- function(df, cols) { + out <- pivot_longer(df, cols = all_of(cols), names_to = "Variable", values_to = "Value") %>% + select(Variable, Value) + out$Variable <- factor(out$Variable, levels = cols) + out +} + +# Helper: generic violin + box + mean-cross plot with shared theme +build_violin_plot <- function(df_long, + title, + x_labels, + fill_map, + color_map = fill_map, + x_title = "", + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + ylim = NULL, + override_outline_vars = character(0), + violin_alpha = 0.7, + box_alpha = 0.6, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = FALSE, + box_outline_default = "grey20", + adjust = 1, + ...) { + # Clamp values for percentage / count plots + if (grepl("%", y_label)) { + df_long$Value <- pmin(pmax(df_long$Value, 0), 100) + } else if (grepl("count", y_label, ignore.case = TRUE)) { + df_long$Value <- pmax(df_long$Value, 0) + } + + # Compute robust bandwidth for KDE + vals <- df_long$Value[is.finite(df_long$Value)] + bw_eff <- if (length(vals) >= 2) stats::bw.nrd0(vals) * adjust else 0.1 + if (is.na(bw_eff) || bw_eff <= 0) bw_eff <- 0.1 + + p <- ggplot(df_long, aes(x = Variable, y = Value)) + + # Violin layer with outline rule + { + if (isTRUE(violin_outline_fill)) { + geom_violin(aes(fill = Variable, color = Variable), alpha = violin_alpha, scale = "width", show.legend = legend, bw = bw_eff, trim = TRUE) + } else { + geom_violin(aes(fill = Variable), color = "black", alpha = violin_alpha, scale = "width", show.legend = legend, bw = bw_eff, trim = TRUE) + } + } + + scale_fill_manual(values = fill_map, labels = x_labels) + + { + if (isTRUE(violin_outline_fill)) scale_color_manual(values = fill_map, guide = "none") else NULL + } + + scale_x_discrete(labels = x_labels) + + labs(title = title, x = x_title, y = y_label) + + theme_classic(base_size = 11) + + theme( + plot.title = element_text(size = 12, face = "bold", hjust = 0.5), + axis.title = element_text(size = 12), + axis.text.y = element_text(size = 11), + axis.text.x = element_text(size = 11, angle = x_tickangle, hjust = ifelse(x_tickangle == 0, 0.5, 1)), + legend.position = if (legend) "bottom" else "none" + ) + + # Add boxplots per variable with correct outline color (grey90 overrides) + for (var in levels(df_long$Variable)) { + var_df <- df_long[df_long$Variable == var, , drop = FALSE] + box_col <- if (var %in% override_outline_vars) "grey90" else box_outline_default + p <- p + geom_boxplot( + data = var_df, + aes(x = Variable, y = Value, fill = Variable), + width = box_width, outlier.shape = NA, alpha = box_alpha, show.legend = FALSE, color = box_col, lwd = 0.3 + ) + } + + # Add mean markers on top (moved here to separate from violin layer and ensure it is on top of boxplots) + p <- p + stat_summary(fun = mean, geom = "point", shape = 4, size = 1, color = "red", stroke = 1, show.legend = FALSE) + + if (!is.null(ylim)) { + p <- p + coord_cartesian(ylim = ylim) + } + + return(p) +} + +generate_sqantisc_plots <- function(SQANTI_cell_summary, Classification_file, Junctions, report_output, generate_pdf = TRUE) { + # Helper function to mix colors + mix_color <- function(col, target, amount) { + c_rgb <- col2rgb(col) + t_rgb <- col2rgb(target) + mix <- c_rgb * (1 - amount) + t_rgb * amount + rgb(mix[1], mix[2], mix[3], maxColorValue = 255) + } + + # Generate UMAP plots by structural category if UMAP exists + if (exists("gg_umap") && !is.null(gg_umap)) { + tryCatch( + { + umap_data <- gg_umap$data + + # Merge with SQANTI_cell_summary + # umap_data has 'Barcode', SQANTI_cell_summary has 'CB' + merged_umap <- inner_join(umap_data, SQANTI_cell_summary, by = c("Barcode" = "CB")) + + if (nrow(merged_umap) > 0) { + gg_umap_by_category <<- list() + + # Define categories and their colors + cat_colors <- c( + "FSM_prop" = "#6BAED6", + "ISM_prop" = "#FC8D59", + "NIC_prop" = "#78C679", + "NNC_prop" = "#EE6A50", + "Genic_Genomic_prop" = "#969696", + "Antisense_prop" = "#66C2A4", + "Fusion_prop" = "goldenrod1", + "Intergenic_prop" = "darksalmon", + "Genic_intron_prop" = "#41B6C4" + ) + + cat_labels <- c( + "FSM_prop" = "FSM", + "ISM_prop" = "ISM", + "NIC_prop" = "NIC", + "NNC_prop" = "NNC", + "Genic_Genomic_prop" = "Genic Genomic", + "Antisense_prop" = "Antisense", + "Fusion_prop" = "Fusion", + "Intergenic_prop" = "Intergenic", + "Genic_intron_prop" = "Genic Intron" + ) + + for (cat_col in names(cat_colors)) { + if (cat_col %in% colnames(merged_umap)) { + cat_color <- cat_colors[[cat_col]] + cat_label <- cat_labels[[cat_col]] + + dark_color <- mix_color(cat_color, "black", 0.6) + light_color <- mix_color(cat_color, "white", 0.8) + + p <- ggplot(merged_umap, aes(x = UMAP_1, y = UMAP_2, color = .data[[cat_col]])) + + geom_point(alpha = 0.6, size = 0.5) + + theme_classic() + + labs(title = paste("UMAP - %", cat_label), x = "UMAP 1", y = "UMAP 2", color = paste0(entity_label_plural, ", %")) + + theme( + plot.title = element_text(size = 18, face = "bold", hjust = 0.5), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.title = element_text(face = "bold"), + legend.position = "right", + legend.key.height = unit(3, "cm"), + legend.key.width = unit(1, "cm") # Thicker legend bar + ) + + scale_color_gradientn(colors = c(light_color, cat_color, dark_color)) + # Custom gradient + guides(color = guide_colorbar(barwidth = 2.5, barheight = 15)) # Make legend bar thicker and taller + + gg_umap_by_category[[cat_label]] <<- p + } + } + + # ---------------------------------------------------------------- + # Common Helpers for Cluster Violin Plots + # ---------------------------------------------------------------- + prepare_violin_data <- function(data, y_col) { + df <- data[!is.na(data[[y_col]]), c("Cluster", y_col)] + colnames(df) <- c("Variable", "Value") + df$Variable <- as.factor(df$Variable) + return(df) + } + + unique_clusters <- levels(merged_umap$Cluster) + cluster_colors <- scales::hue_pal()(length(unique_clusters)) + names(cluster_colors) <- unique_clusters + + # ---------------------------------------------------------------- + # Structural Categories Support by Cluster (Violin Plots) + # ---------------------------------------------------------------- + gg_cat_cluster_plots <<- list() + for (cat_col in names(cat_colors)) { + if (cat_col %in% colnames(merged_umap)) { + cat_color <- cat_colors[[cat_col]] + cat_label <- cat_labels[[cat_col]] + + fixed_color_map <- rep(cat_color, length(unique_clusters)) + names(fixed_color_map) <- unique_clusters + + cat_data <- prepare_violin_data(merged_umap, cat_col) + + gg_cat_cluster_plots[[cat_label]] <<- build_violin_plot( + df_long = cat_data, + title = paste(cat_label, "Distribution"), + x_labels = levels(cat_data$Variable), + fill_map = fixed_color_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + x_title = "Cluster", + x_tickangle = 0, + ylim = c(0, 100), + violin_outline_fill = TRUE, + violin_alpha = 0.7, + box_alpha = 0.3 + ) + } + } + + # ---------------------------------------------------------------- + # Length Distribution by Cluster (Violin/Box Plots) + # ---------------------------------------------------------------- + tryCatch({ + cb_cluster_map <- umap_data[, c("Barcode", "Cluster"), drop = FALSE] + cb_cluster_map <- cb_cluster_map[!is.na(cb_cluster_map$Barcode) & cb_cluster_map$Barcode != "", , drop = FALSE] + + cls_for_len <- Classification_file + cls_for_len$length_num <- suppressWarnings(as.numeric(cls_for_len$length)) + + if (mode == "isoforms" && "FL" %in% colnames(cls_for_len) && "CB" %in% colnames(cls_for_len)) { + cls_for_len$CB_raw <- as.character(cls_for_len$CB) + cls_for_len$FL_raw <- as.character(cls_for_len$FL) + cls_for_len <- tidyr::separate_rows(cls_for_len, CB_raw, FL_raw, sep = ",") + cls_for_len$FL_num <- suppressWarnings(as.numeric(trimws(cls_for_len$FL_raw))) + cls_for_len$FL_num[is.na(cls_for_len$FL_num) | cls_for_len$FL_num < 1] <- 1 + cls_for_len$CB_clean <- trimws(cls_for_len$CB_raw) + } else if ("CB" %in% colnames(cls_for_len)) { + cls_for_len$CB_clean <- as.character(cls_for_len$CB) + cls_for_len$FL_num <- 1 + } else { + cls_for_len <- data.frame() + } + + if (nrow(cls_for_len) > 0) { + cls_for_len <- merge(cls_for_len, cb_cluster_map, + by.x = "CB_clean", by.y = "Barcode", all.x = FALSE) + cls_for_len <- cls_for_len[ + !is.na(cls_for_len$length_num) & cls_for_len$length_num > 0 & + !is.na(cls_for_len$Cluster), , drop = FALSE] + + if (nrow(cls_for_len) > 0 && any(cls_for_len$FL_num > 1)) { + rep_idx <- rep(seq_len(nrow(cls_for_len)), times = as.integer(cls_for_len$FL_num)) + cls_for_len <- cls_for_len[rep_idx, , drop = FALSE] + } + + build_len_cluster_plot <- function(df_sub, title_str, colors = cluster_colors) { + if (nrow(df_sub) == 0) return(NULL) + df_sub$Cluster <- factor(df_sub$Cluster, levels = unique_clusters) + ggplot(df_sub, aes(x = Cluster, y = length_num, fill = Cluster)) + + geom_violin(aes(color = Cluster), alpha = 0.7, scale = "width", + adjust = 1.2, trim = TRUE, show.legend = FALSE) + + scale_color_manual(values = colors, guide = "none") + + geom_boxplot(width = 0.05, alpha = 0.5, outlier.shape = NA, + color = "grey20", show.legend = FALSE) + + stat_summary(fun = mean, geom = "point", shape = 4, + size = 1, color = "red", stroke = 1, show.legend = FALSE) + + scale_fill_manual(values = colors) + + scale_y_log10(labels = scales::comma) + + labs( + title = title_str, + x = "Cluster", + y = if (mode == "isoforms") paste(entity_label_plural, "Length (bp, log10)") + else "Feature Length (bp, log10)" + ) + + theme_classic(base_size = 11) + + theme( + plot.title = element_text(size = 12, face = "bold", hjust = 0.5), + axis.title = element_text(size = 12), + axis.text.y = element_text(size = 11), + axis.text.x = element_text(size = 11), + legend.position = "none" + ) + } + + all_label <- if (mode == "isoforms") "All Transcripts" else "All Reads" + gg_len_cluster_plots <<- list() + gg_len_cluster_plots[[all_label]] <<- build_len_cluster_plot( + cls_for_len, paste(all_label, "Length Distribution by Cluster") + ) + + cat_sc_map <- c( + "FSM" = "full-splice_match", + "ISM" = "incomplete-splice_match", + "NIC" = "novel_in_catalog", + "NNC" = "novel_not_in_catalog", + "Genic Genomic" = "genic", + "Antisense" = "antisense", + "Fusion" = "fusion", + "Intergenic" = "intergenic", + "Genic Intron" = "genic_intron" + ) + for (lbl in names(cat_sc_map)) { + # Find the matching cat_colors key (e.g. "FSM_prop" for lbl "FSM") + cat_col_key <- names(cat_colors)[match(lbl, cat_labels)] + cat_hex <- if (!is.na(cat_col_key) && cat_col_key %in% names(cat_colors)) + cat_colors[[cat_col_key]] else "#888888" + fixed_cat_colors <- rep(cat_hex, length(unique_clusters)) + names(fixed_cat_colors) <- unique_clusters + + sub_df <- cls_for_len[ + !is.na(cls_for_len$structural_category) & + cls_for_len$structural_category == cat_sc_map[[lbl]], , drop = FALSE] + if (nrow(sub_df) > 0) { + gg_len_cluster_plots[[lbl]] <<- build_len_cluster_plot( + sub_df, paste(lbl, "Length Distribution by Cluster"), + colors = fixed_cat_colors + ) + } + } + } + }, error = function(e) { + message("Could not build length-by-cluster plots: ", e$message) + }) + + # ---------------------------------------------------------------- + # Short Read Support by Cluster (Violin Plots) + # ---------------------------------------------------------------- + # Use the new column name: srjunctions_support_prop + if ("srjunctions_support_prop" %in% colnames(merged_umap) && sum(merged_umap$srjunctions_support_prop, na.rm = TRUE) > 0) { + gg_sr_cluster_plots <<- list() + + # ---------------------------------------------------------------- + # TSS Ratio Validated Support by Cluster (Violin Plots) + # ---------------------------------------------------------------- + if ("TSS_ratio_validated_prop" %in% colnames(merged_umap) && sum(merged_umap$TSS_ratio_validated_prop, na.rm = TRUE) > 0) { + gg_tss_cluster_plots <<- list() + + # 1. All Transcripts Plot - TSS + # Use Cluster Colors (same as Junctions Coverage) + p_all_tss <- build_violin_plot( + df_long = prepare_violin_data(merged_umap, "TSS_ratio_validated_prop"), + title = "All Transcripts TSS Validation by Short Reads", + x_labels = levels(merged_umap$Cluster), + fill_map = cluster_colors, + x_title = "Cluster", + y_label = "TSS Ratio Validated, %", + x_tickangle = 0, + violin_outline_fill = TRUE, + violin_alpha = 0.7, + box_alpha = 0.3 + ) + gg_tss_cluster_plots[["All Transcripts"]] <<- p_all_tss + + # 2. Per-Category Plots - TSS + # Use Category Color for ALL clusters + for (cat_col in names(cat_colors)) { + tag <- cat_labels[[cat_col]] + prop_col <- paste0(tag, "_TSS_ratio_validated_prop") + if (tag == "Genic Genomic") prop_col <- "Genic_TSS_ratio_validated_prop" # Handle Genic weirdness if needed + if (tag == "Genic Intron") prop_col <- "Genic_intron_TSS_ratio_validated_prop" + + # Clean up tag to match column naming convention if straightforward + simple_tag <- names(cat_labels)[which(cat_labels == tag)] + simple_tag <- gsub("_prop", "", simple_tag) + prop_col <- paste0(simple_tag, "_TSS_ratio_validated_prop") + + if (prop_col %in% colnames(merged_umap)) { + # Define single color map + current_cat_color <- cat_colors[[cat_col]] + fixed_color_map <- rep(current_cat_color, length(unique_clusters)) + names(fixed_color_map) <- unique_clusters + + p_cat_tss <- build_violin_plot( + df_long = prepare_violin_data(merged_umap, prop_col), + title = paste(tag, "TSS Validation by Short Reads"), + x_labels = levels(merged_umap$Cluster), + fill_map = fixed_color_map, + x_title = "Cluster", + y_label = "TSS Ratio Validated, %", + x_tickangle = 0, + violin_outline_fill = TRUE, + violin_alpha = 0.7, + box_alpha = 0.3 + ) + gg_tss_cluster_plots[[tag]] <<- p_cat_tss + } + } + } + + # ---------------------------------------------------------------- + # Short Read Support by Cluster (Violin Plots) + # ---------------------------------------------------------------- + # 1. Global Plot + global_data <- prepare_violin_data(merged_umap, "srjunctions_support_prop") + + # Use build_violin_plot (which returns a ggplot object) + gg_sr_cluster_plots[["All Transcripts"]] <<- build_violin_plot( + df_long = global_data, + title = "All Transcripts Junction Coverage by Short Reads", + x_labels = levels(global_data$Variable), + fill_map = cluster_colors, + y_label = "Transcripts Supported, %", + legend = FALSE, + x_title = "Cluster", + x_tickangle = 0, + ylim = c(0, 100), + violin_outline_fill = TRUE, + violin_alpha = 0.7, + box_alpha = 0.3 + ) + + # 2. Per-Category Plots + # For these, we want to maintain the specific Structural Category color Scheme? + # The user said: "the colors used for the violins and boxes should be the same in each structural category and the color should be the corresponding to the structural category." + # This implies that for the FSM plot, ALL clusters should be colored with the FSM color. + + for (cat_col in names(cat_labels)) { + # cat_labels[[cat_col]] is e.g. "FSM", "Genic Genomic" + tag <- cat_labels[[cat_col]] + tag_clean <- gsub(" ", "_", tag) + # New column name format: {TAG}_srjunctions_support_prop + sr_col <- paste0(tag_clean, "_srjunctions_support_prop") + + if (sr_col %in% colnames(merged_umap)) { + cat_data <- prepare_violin_data(merged_umap, sr_col) + + current_cat_color <- cat_colors[[cat_col]] + fixed_color_map <- rep(current_cat_color, length(unique_clusters)) + names(fixed_color_map) <- unique_clusters + + title <- paste(tag, "Junction Coverage by Short Reads") + + gg_sr_cluster_plots[[tag]] <<- build_violin_plot( + df_long = cat_data, + title = title, + x_labels = levels(cat_data$Variable), + fill_map = fixed_color_map, # Per-category uses the category color for all clusters + y_label = "Transcripts Supported, %", + legend = FALSE, + x_title = "Cluster", + x_tickangle = 0, + ylim = c(0, 100), + violin_outline_fill = TRUE, + violin_alpha = 0.7, + box_alpha = 0.3 + ) + } + } + } + } + }, + error = function(e) { + print(paste("Error generating UMAP by category plots:", e$message)) + } + ) + } + + # ---------------------------------------------------------------- + # Helper: Build Continuous UMAP (for coloring by %) + # ---------------------------------------------------------------- + build_continuous_umap <- function(data, color_col, title, color_base = "blue") { + mix_color <- function(col, target, amount) { + c_rgb <- col2rgb(col) + t_rgb <- col2rgb(target) + mix <- c_rgb * (1 - amount) + t_rgb * amount + rgb(mix[1], mix[2], mix[3], maxColorValue = 255) + } + + if (!all(c("UMAP_1", "UMAP_2") %in% colnames(data))) { + return(NULL) + } + + plot_data <- data[!is.na(data[[color_col]]), ] + if (nrow(plot_data) == 0) { + return(NULL) + } + + dark_color <- mix_color(color_base, "black", 0.6) + light_color <- mix_color(color_base, "white", 0.8) + + p <- ggplot(plot_data, aes(x = UMAP_1, y = UMAP_2, color = .data[[color_col]])) + + geom_point(alpha = 0.6, size = 0.5) + + scale_color_gradientn( + colors = c(light_color, color_base, dark_color), + limits = c(0, max(plot_data[[color_col]], na.rm = TRUE)) + ) + + guides(color = guide_colorbar(barwidth = 2.5, barheight = 15)) + + labs(title = title, x = "UMAP 1", y = "UMAP 2", color = paste0(entity_label_plural, ", %")) + + theme_classic() + + theme( + plot.title = element_text(hjust = 0.5, face = "bold", size = 14), + axis.title = element_text(size = 16), + axis.text.x = element_text(size = 14), + axis.text.y = element_text(size = 14), + legend.position = "right", + legend.title = element_text(face = "bold"), + legend.key.height = unit(3, "cm"), + legend.key.width = unit(1, "cm") + ) + return(p) + } + + # ---------------------------------------------------------------- + # Short Read (SJ) Validation UMAPs + # ---------------------------------------------------------------- + gg_sr_umap_plots <<- list() + if (exists("merged_umap") && "srjunctions_support_prop" %in% colnames(merged_umap)) { + # Global + gg_sr_umap_plots[["All Transcripts"]] <<- build_continuous_umap( + merged_umap, + "srjunctions_support_prop", + "All Transcripts Junction Coverage by Short Reads", + color_base = "#cd4f39" + ) + + # Per-Category + for (cat_col in names(cat_labels)) { + tag <- cat_labels[[cat_col]] + tag_clean <- gsub(" ", "_", tag) + sr_col <- paste0(tag_clean, "_srjunctions_support_prop") + + if (sr_col %in% colnames(merged_umap)) { + gg_sr_umap_plots[[tag]] <<- build_continuous_umap( + merged_umap, + sr_col, + paste(tag, "Junction Coverage by Short Reads"), + color_base = cat_colors[[cat_col]] + ) + } + } + } + + # ---------------------------------------------------------------- + # TSS Validation UMAPs + # ---------------------------------------------------------------- + gg_tss_umap_plots <<- list() + if (exists("merged_umap") && "TSS_ratio_validated_prop" %in% colnames(merged_umap)) { + # Global + gg_tss_umap_plots[["All Transcripts"]] <<- build_continuous_umap( + merged_umap, + "TSS_ratio_validated_prop", + "All Transcripts TSS Validation by Short Reads", + color_base = "#ffc125" + ) + + # Per-Category + for (cat_col in names(cat_labels)) { + tag <- cat_labels[[cat_col]] + tag_clean <- gsub(" ", "_", tag) + # Handle special cases if any (e.g. Genic Genomic) + simple_tag <- names(cat_labels)[which(cat_labels == tag)] + simple_tag <- gsub("_prop", "", simple_tag) + tss_col <- paste0(simple_tag, "_TSS_ratio_validated_prop") + + if (tss_col %in% colnames(merged_umap)) { + gg_tss_umap_plots[[tag]] <<- build_continuous_umap( + merged_umap, + tss_col, + paste(tag, "TSS Validation by Short Reads"), + color_base = cat_colors[[cat_col]] + ) + } + } + } + + # Helper: grouped violins by bin with legend (Annotated/Novel) using ggplot, rebuildable for PDF + # df must contain columns: bin, group, value + build_grouped_violin_plot <- function(df, + bin_levels, + group_levels, + title, + fill_map, + legend_labels, + y_label = "Genes, %", + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_width = 0.45, + dodge_width = 0.8, + violangap = 0.05, + violingroupgap = 0.15, + legend_title = NULL) { + # Ensure factors + df$bin <- factor(df$bin, levels = bin_levels) + df$group <- factor(df$group, levels = group_levels) + + # Clamp values to ylim range + df$value <- pmin(pmax(df$value, ylim[1]), ylim[2]) + + # Shared bandwidth + valid_vals <- df$value[is.finite(df$value)] + bw_eff <- if (length(valid_vals) >= 2) stats::bw.nrd0(valid_vals) else 0.1 + if (is.na(bw_eff) || bw_eff <= 0) bw_eff <- 0.1 + + p <- ggplot(df, aes(x = bin, y = value, fill = group)) + + geom_violin(aes(color = group), + alpha = violin_alpha, + position = position_dodge(width = dodge_width), + scale = "width", + trim = TRUE, + bw = bw_eff, + linewidth = 0.3, + show.legend = TRUE + ) + + geom_boxplot(aes(group = interaction(bin, group)), + width = box_width, + outlier.shape = NA, + fill = NA, + color = "grey20", + alpha = box_alpha, + position = position_dodge(width = dodge_width), + show.legend = FALSE + ) + + stat_summary(aes(group = group), + fun = mean, geom = "point", shape = 4, size = 1, + colour = "red", stroke = 0.9, + position = position_dodge(width = dodge_width), + show.legend = FALSE + ) + + scale_fill_manual(values = fill_map, labels = legend_labels) + + scale_color_manual(values = fill_map, guide = "none") + + labs(title = title, x = "", y = y_label) + + theme_classic(base_size = 11) + + theme( + plot.title = element_text(size = 12, face = "bold", hjust = 0.5), + axis.title = element_text(size = 12), + axis.text.y = element_text(size = 11), + axis.text.x = element_text( + size = 11, + angle = if (!is.null(x_tickangle) && x_tickangle != 0) x_tickangle else 0, + hjust = ifelse(!is.null(x_tickangle) && x_tickangle == 0, 0.5, 1) + ), + legend.position = "bottom", + legend.title = element_blank() + ) + + if (!is.null(ylim)) { + p <- p + coord_cartesian(ylim = ylim) + } + return(p) + } + `%||%` <- function(x, y) if (is.null(x)) y else x + assign_plot <- function(name, plot) assign(name, plot, envir = .GlobalEnv) + build_violin_from_long <- function(df_long, args) { + do.call(build_violin_plot, c(list(df_long = df_long), args)) + } + single_violin <- function(df, cfg) { + var <- cfg$column + df_long <- data.frame(Variable = factor(var, levels = var), Value = df[[var]]) + fill_map <- setNames(cfg$fill, var) + base_args <- list( + title = cfg$title, + x_labels = cfg$x_labels %||% cfg$x_label, + fill_map = fill_map, + legend = cfg$legend %||% FALSE + ) + if (!is.null(cfg$y_label)) base_args$y_label <- cfg$y_label + plot_args <- c(base_args, cfg$plot_args %||% list()) + assign_plot(cfg$name, build_violin_from_long(df_long, plot_args)) + } + pivot_violin <- function(df, cfg) { + df_long <- pivot_long(df, cfg$columns) + fill_map <- cfg$fill_map %||% setNames(rep(cfg$fill, length(cfg$columns)), cfg$columns) + base_args <- list( + title = cfg$title, + x_labels = cfg$x_labels, + fill_map = fill_map, + legend = cfg$legend %||% FALSE + ) + if (!is.null(cfg$y_label)) base_args$y_label <- cfg$y_label + plot_args <- c(base_args, cfg$plot_args %||% list()) + assign_plot(cfg$name, build_violin_from_long(df_long, plot_args)) + } + render_pdf_plot <- function(name) { + if (exists(name)) print(get(name)) + } + + render_pdf_plot_centered <- function(name, width_frac = 0.45) { + if (!exists(name)) { + return(invisible(NULL)) + } + p <- get(name) + g <- if (inherits(p, "grob")) p else ggplotGrob(p) + left_right <- (1 - width_frac) / 2 + grid.arrange(nullGrob(), g, nullGrob(), widths = c(left_right, width_frac, left_right), newpage = TRUE) + } + + # Helper: build length-distribution violins for given column prefix + # If mono=TRUE, uses *_length_mono_prop columns; otherwise *_length_prop + build_len_violin_for_prefix <- function(df, prefix, title, fill_color, box_fill = NULL, mono = FALSE, box_outline_color = "grey20", violin_alpha = 0.5, box_alpha = 0.3, violin_outline_fill = FALSE, format = "ggplot") { + suffix <- if (mono) "_length_mono_prop" else "_length_prop" + cols <- paste0(prefix, c("_250b", "_500b", "_short", "_mid", "_long"), suffix) + cols <- cols[cols %in% colnames(df)] + if (length(cols) == 0) { + return(NULL) + } + + df_long <- pivot_long(df, cols) + fill_map <- setNames(rep(fill_color, length(cols)), cols) + color_map <- setNames(rep(fill_color, length(cols)), cols) + x_labels <- c("0-250bp", "250-500bp", "500-1000bp", "1000-2000bp", ">2000bp") + names(x_labels) <- cols + + build_violin_plot( + df_long = df_long, + title = title, + x_labels = x_labels, + fill_map = fill_map, + color_map = color_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + violin_alpha = violin_alpha, + box_alpha = box_alpha, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = violin_outline_fill, + box_outline_default = box_outline_color + ) + } + + # Meta-transcript body coverage profile plot + # Shows where reads cover their reference transcripts (0%=5' to 100%=3'), + # one line per reference transcript length bin. + build_meta_coverage_plot <- function(cls_df, n_bins = 100) { + required_cols <- c("diff_to_TSS", "diff_to_TTS", "ref_length", "length") + if (!all(required_cols %in% colnames(cls_df))) { + return(NULL) + } + + keep_cols <- c( + required_cols, + if ("count" %in% colnames(cls_df)) "count", + if ("structural_category" %in% colnames(cls_df)) "structural_category" + ) + df <- cls_df[, keep_cols, drop = FALSE] + for (col in required_cols) df[[col]] <- suppressWarnings(as.numeric(df[[col]])) + if (!("count" %in% colnames(df))) df$count <- 1L + df$count <- suppressWarnings(as.numeric(df$count)) + df$count[is.na(df$count) | df$count <= 0] <- 1 + df <- df[complete.cases(df[, required_cols]) & df$ref_length > 0, ] + if (nrow(df) == 0) { + return(NULL) + } + + # Compute coverage fractions (fraction of ref body covered by each read) + # diff_to_TSS: positive = read starts upstream of ref TSS, negative = downstream + # diff_to_TTS: positive = read ends downstream of ref TTS, negative = upstream + # + # For FSM: diffs are within terminal exons, so genomic = transcript space. + # Use exact formula: start = -diff_to_TSS / ref_length + # For ISM: diffs may cross introns, so genomic >> transcript space. + # Use length/ref_length ratio + sign-based positioning: + # - 5' only truncated: covers (1-ratio) to 1 + # - 3' only truncated: covers 0 to ratio + # - Both truncated: use genomic ratio to split the gap (approximation) + is_fsm <- !is.na(df$structural_category) & df$structural_category == "full-splice_match" + cov_ratio <- pmin(1, df$length / df$ref_length) + + # FSM: exact formula (diffs within terminal exons = transcript space) + df$start_frac <- ifelse(is_fsm, + pmax(0, pmin(1, -df$diff_to_TSS / df$ref_length)), + NA_real_ + ) + df$end_frac <- ifelse(is_fsm, + pmax(0, pmin(1, 1 + df$diff_to_TTS / df$ref_length)), + NA_real_ + ) + + # ISM: sign-based positioning with length/ref_length ratio + is_ism <- !is_fsm + tss_neg <- df$diff_to_TSS < 0 # 5' truncated + tts_neg <- df$diff_to_TTS < 0 # 3' truncated + + # ISM: only 3' truncated (starts at ref TSS, ends early) + idx <- is_ism & !tss_neg & tts_neg + df$start_frac[idx] <- 0 + df$end_frac[idx] <- cov_ratio[idx] + + # ISM: only 5' truncated (starts late, ends at ref TTS) + idx <- is_ism & tss_neg & !tts_neg + df$start_frac[idx] <- 1 - cov_ratio[idx] + df$end_frac[idx] <- 1 + + # ISM: both truncated — use genomic ratio to distribute the gap + idx <- is_ism & tss_neg & tts_neg + total_gap <- pmax(0, df$ref_length[idx] - df$length[idx]) + abs_tss <- abs(df$diff_to_TSS[idx]) + abs_tts <- abs(df$diff_to_TTS[idx]) + genomic_total <- abs_tss + abs_tts + five_prime_share <- ifelse(genomic_total > 0, abs_tss / genomic_total, 0.5) + df$start_frac[idx] <- pmin(1, total_gap * five_prime_share / df$ref_length[idx]) + df$end_frac[idx] <- pmin(1, df$start_frac[idx] + cov_ratio[idx]) + + # ISM: neither truncated (rare, but handle: full coverage) + idx <- is_ism & !tss_neg & !tts_neg + df$start_frac[idx] <- 0 + df$end_frac[idx] <- pmin(1, cov_ratio[idx]) + + # Safety: keep only rows where end > start + df <- df[df$end_frac > df$start_frac, ] + if (nrow(df) == 0) { + return(NULL) + } + + # Bin by reference transcript length + df$len_bin <- cut(df$ref_length, + breaks = c(0, 250, 500, 1000, 2000, 5000, Inf), + labels = c("0-250bp", "250-500bp", "500bp-1kb", "1-2kb", "2-5kb", ">5kb"), + right = FALSE + ) + + # For each bin, compute weighted % of reads covering each position + bin_positions <- seq(0, 1, length.out = n_bins + 1) + bin_mids <- (bin_positions[-length(bin_positions)] + bin_positions[-1]) / 2 + + profile_list <- lapply(levels(df$len_bin), function(lb) { + sub <- df[df$len_bin == lb, ] + if (nrow(sub) == 0) { + return(NULL) + } + total_weight <- sum(sub$count) + coverage <- vapply(seq_len(n_bins), function(i) { + pos <- bin_mids[i] + sum(sub$count[sub$start_frac <= pos & sub$end_frac >= pos]) / total_weight * 100 + }, numeric(1)) + data.frame( + position = bin_mids * 100, + coverage = coverage, + len_bin = lb, + n_reads = total_weight, + stringsAsFactors = FALSE + ) + }) + profile_df <- do.call(rbind, Filter(Negate(is.null), profile_list)) + if (is.null(profile_df) || nrow(profile_df) == 0) { + return(NULL) + } + + profile_df$len_bin <- factor(profile_df$len_bin, levels = levels(df$len_bin)) + + # RColorConesa main palette (7 discrete base colors) + n_levels <- nlevels(df$len_bin) + bin_colors <- c("#00B0A5", "#E1744E", "#FAC24A", "#6DC8E5", "#E7A5CB", "#9C8AB4", "#E44067")[seq_len(n_levels)] + + p <- ggplot(profile_df, aes(x = position, y = coverage, color = len_bin)) + + geom_line(linewidth = 0.9, alpha = 0.85) + + scale_color_manual(values = setNames(bin_colors, levels(profile_df$len_bin))) + + scale_x_continuous(breaks = seq(0, 100, by = 10), limits = c(0, 100)) + + scale_y_continuous(limits = c(0, 100)) + + labs( + title = paste(entity_label, "Body Coverage Along Reference Transcript"), + x = "Position along reference transcript (%)\n5' -> 3'", + y = paste(entity_label_plural, "covering position, %"), + color = "Reference length" + ) + + theme_classic(base_size = 14) + + theme( + plot.title = element_text(size = 16, face = "bold", hjust = 0.5), + axis.title = element_text(size = 14), + axis.text = element_text(size = 12), + legend.position = "bottom", + legend.title = element_text(face = "bold", size = 11), + legend.text = element_text(size = 10) + ) + + guides(color = guide_legend(nrow = 2, byrow = TRUE)) + return(p) + } + + # Build GTF reference length violin/boxplot comparison (works for both 'isoforms' and 'reads' modes) + build_ref_vs_sample_lengths <- function(cls_df, ref_gtf, mode = "isoforms") { + if (is.null(ref_gtf)) { + return(NULL) + } + + # Strip quotes if passed by the shell + ref_gtf <- gsub('^"|"$', "", ref_gtf) + + if (!file.exists(ref_gtf)) { + return(NULL) + } + + # 1. Sample Transcripts + if (!"length" %in% colnames(cls_df)) { + return(NULL) + } + sample_lengths <- suppressWarnings(as.numeric(cls_df$length)) + sample_lengths <- sample_lengths[!is.na(sample_lengths) & sample_lengths > 0] + + if (length(sample_lengths) == 0) { + return(NULL) + } + + sample_label <- if (mode == "isoforms") "Sample Transcriptome" else "Sample Reads" + sample_df <- data.frame( + length = sample_lengths, + Dataset = sample_label, + stringsAsFactors = FALSE + ) + + # 2. Reference Transcripts via data.table rapid GTF parsing + # Skip comment lines explicitly to prevent fread from guessing incorrect column number + skip_lines <- 0 + con <- try(file(ref_gtf, "r"), silent = TRUE) + if (!inherits(con, "try-error")) { + while (TRUE) { + line <- suppressWarnings(readLines(con, n = 1)) + if (length(line) == 0) break + if (startsWith(line, "#")) { + skip_lines <- skip_lines + 1 + } else { + break + } + } + close(con) + } + + gtf <- tryCatch( + data.table::fread(ref_gtf, sep = "\t", skip = skip_lines, header = FALSE, fill = TRUE, quote = ""), + error = function(e) NULL + ) + if (is.null(gtf) || nrow(gtf) == 0 || ncol(gtf) < 9) { + return(NULL) + } + + # V3 = feature, V4 = start, V5 = end, V9 = attributes + exons <- gtf[V3 == "exon"] + if (nrow(exons) == 0) { + return(NULL) + } + + exons[, start_pos := suppressWarnings(as.numeric(V4))] + exons[, end_pos := suppressWarnings(as.numeric(V5))] + exons <- exons[!is.na(start_pos) & !is.na(end_pos)] + exons[, exon_length := end_pos - start_pos + 1] + + # Extract transcript_id via regex + exons[, transcript_id := sub(".*transcript_id \"([^\"]+)\".*", "\\1", V9)] + + # Aggregate by transcript + ref_lengths_dt <- exons[, .(length = sum(exon_length, na.rm = TRUE)), by = transcript_id] + ref_lengths <- ref_lengths_dt$length[ref_lengths_dt$length > 0] + + if (length(ref_lengths) == 0) { + return(NULL) + } + + ref_df <- data.frame( + length = ref_lengths, + Dataset = "Reference Transcriptome", + stringsAsFactors = FALSE + ) + + # 3. Combine and Plot + plot_df <- rbind(sample_df, ref_df) + dataset_levels <- c("Reference Transcriptome", sample_label) + plot_df$Dataset <- factor(plot_df$Dataset, levels = dataset_levels) + pal <- setNames(c("#1fa291", "#f5c05d"), dataset_levels) + y_axis_label <- if (mode == "isoforms") paste(entity_label_plural, "Length (bp, log10)") else "Feature Length (bp, log10)" + plot_subtitle <- if (mode == "isoforms") "Reference transcriptome vs. Sample transcriptome" else "Reference transcriptome vs. Sample reads" + + p <- ggplot(plot_df, aes(x = Dataset, y = length, fill = Dataset)) + + geom_violin(aes(color = Dataset), alpha = 0.7, scale = "width", adjust = 1.5, trim = TRUE, show.legend = FALSE) + + scale_color_manual(values = pal, guide = "none") + + geom_boxplot(width = 0.05, alpha = 0.6, outlier.shape = NA, color = "grey20", show.legend = FALSE) + + stat_summary(fun = mean, geom = "point", shape = 4, size = 1, color = "red", stroke = 1, show.legend = FALSE) + + scale_y_log10(labels = scales::comma) + + scale_fill_manual(values = pal) + + labs( + title = paste0("Transcript Length Distribution:\n", plot_subtitle), + x = "", + y = y_axis_label + ) + + theme_classic(base_size = 11) + + theme( + plot.title = element_text(size = 12, face = "bold", hjust = 0.5), + axis.title = element_text(size = 12), + axis.text.y = element_text(size = 11), + axis.text.x = element_text(size = 11, angle = 45, hjust = 1) + ) + + return(p) + } + + # Helper: build a per-category exon count profile (median + IQR across cells) + build_exon_profile_plot <- function(df_prof, title, line_color, k_max = 20, y_label = paste(entity_label_plural, ", %", sep = ""), n_cells = NULL) { + # Sanitize title + title <- tryCatch( + { + t <- as.character(title) + trimws(gsub("\n.*", "", t)) + }, + error = function(e) "Exon Profile" + ) + + # Canonical Fusion color override + FUSION_COLOR <- "#F1C40F" + detect_fusion <- function(df) { + tryCatch( + { + ttl_has <- grepl("Fusion", title, ignore.case = TRUE) + in_df <- "category" %in% colnames(df) && any(grepl("Fusion", df$category, ignore.case = TRUE)) + ttl_has || in_df + }, + error = function(e) FALSE + ) + } + if (detect_fusion(df_prof)) { + line_color <- FUSION_COLOR + } + + # Helpers: color utilities + lighten_hex <- function(hex, amount = 0.35) { + rgb <- grDevices::col2rgb(hex) + r <- as.integer(round(rgb[1] + (255 - rgb[1]) * amount)) + g <- as.integer(round(rgb[2] + (255 - rgb[2]) * amount)) + b <- as.integer(round(rgb[3] + (255 - rgb[3]) * amount)) + grDevices::rgb(r, g, b, maxColorValue = 255) + } + + if (is.null(df_prof) || nrow(df_prof) == 0 || all(!is.finite(df_prof$median))) { + p_empty <- ggplot() + + labs(title = title, subtitle = "No data available for this category") + + theme_minimal(base_size = 14) + + theme( + plot.title = element_text(size = 16, face = "bold", hjust = 0.5), + plot.subtitle = element_text(size = 12, hjust = 0.5, color = "gray") + ) + return(p_empty) + } + + tick_breaks <- seq_len(k_max) + label_last <- paste0("\u2265", k_max) + ticktexts <- c(as.character(seq_len(k_max - 1)), label_last) + + # Choose center line: use mean if present, else median + y_center <- if (!is.null(df_prof$mean)) df_prof$mean else df_prof$median + + # Build stat columns for the line/ribbon + stat_cols <- intersect(colnames(df_prof), c("mean", "median")) + line_stats <- if (length(stat_cols)) { + df_prof %>% + dplyr::select(k, dplyr::all_of(stat_cols)) %>% + tidyr::pivot_longer(cols = dplyr::all_of(stat_cols), names_to = "stat", values_to = "value") %>% + dplyr::filter(!is.na(value)) %>% + dplyr::mutate(stat = factor(stat, levels = stat_cols)) + } else { + NULL + } + + iqr_fill <- lighten_hex(line_color, 0.55) + + p <- ggplot(df_prof, aes(x = k)) + + geom_ribbon(aes(ymin = q1, ymax = q3, fill = "IQR"), alpha = 0.25, show.legend = TRUE, key_glyph = "rect") + + scale_y_continuous(limits = c(0, 100)) + + scale_x_continuous(breaks = tick_breaks, labels = ticktexts) + + scale_fill_manual(values = c("IQR" = iqr_fill), name = "") + + labs(title = title, x = "Number of exons", y = y_label) + + theme_classic(base_size = 14) + + theme( + plot.title = element_text(size = 16, face = "bold", hjust = 0.5), + axis.title = element_text(size = 14), + axis.text.x = element_text(size = 12), + axis.text.y = element_text(size = 12), + legend.position = "bottom" + ) + + if (!is.null(line_stats) && nrow(line_stats) > 0) { + line_palette <- setNames(rep(line_color, length(stat_cols)), stat_cols) + linetype_values <- c("mean" = "solid", "median" = "dashed") + legend_linewidths <- c("mean" = 1.2, "median" = 1.0) + if (!"mean" %in% stat_cols) { + linetype_values <- linetype_values[names(linetype_values) != "mean"] + legend_linewidths <- legend_linewidths[names(legend_linewidths) != "mean"] + } + if (!"median" %in% stat_cols) { + linetype_values <- linetype_values[names(linetype_values) != "median"] + legend_linewidths <- legend_linewidths[names(legend_linewidths) != "median"] + } + p <- p + + geom_line(data = line_stats, aes(x = k, y = value, linetype = stat, color = stat, linewidth = stat)) + + scale_color_manual(values = line_palette, name = "") + + scale_linetype_manual(values = linetype_values, name = "") + + scale_linewidth_manual(values = legend_linewidths, name = "") + } + + return(p) + } + + # Basic cell information + + common_plot_args <- list( + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + violin_outline_fill = FALSE, + box_outline_default = "black", + adjust = 1.5 + ) + + # 1. Number of Reads Across Cells + cfg_reads <- list( + column = count_col, + name = "gg_reads_in_cells", + title = paste("Number of", entity_label_plural, "Across Cells"), + fill = "#CC6633", + y_label = paste(entity_label_plural, ", count", sep = ""), + x_label = "Cells", + plot_args = common_plot_args + ) + single_violin(SQANTI_cell_summary, cfg_reads) + + # 2. Number of UMIs Across Cells (only if not isoforms mode) + if (mode != "isoforms") { + cfg_umis <- list( + column = "UMIs_in_cell", + name = "gg_umis_in_cells", + title = "Number of UMIs Across Cells", + fill = "#CC6633", + y_label = "UMIs, count", + x_label = "Cells", + plot_args = common_plot_args + ) + single_violin(SQANTI_cell_summary, cfg_umis) + } + + # 3. Number of Genes Across Cells + cfg_genes <- list( + column = "Genes_in_cell", + name = "gg_genes_in_cells", + title = "Number of Genes Across Cells", + fill = "#CC6633", + y_label = "Genes, count", + x_label = "Cells", + plot_args = common_plot_args + ) + single_violin(SQANTI_cell_summary, cfg_genes) + + # 4. Number of Unique Junction Chains Across Cells + if (mode != "isoforms" && "UJCs_in_cell" %in% names(SQANTI_cell_summary) && !all(is.na(SQANTI_cell_summary$UJCs_in_cell)) && max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) > 0) { + cfg_ujcs <- list( + column = "UJCs_in_cell", + name = "gg_JCs_in_cell", + title = "Number of Unique Junction\nChains Across Cells", + fill = "#CC6633", + y_label = "UJCs, count", + x_label = "Cells", + plot_args = common_plot_args + ) + single_violin(SQANTI_cell_summary, cfg_ujcs) + } + + pivot_defaults <- list( + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + violin_outline_fill = FALSE, + box_outline_default = "black" + ) + pivot_violin(SQANTI_cell_summary, list( + name = "gg_annotation_of_genes_in_cell", + columns = c("Annotated_genes", "Novel_genes"), + title = "Number of Known/Novel Genes Across Cells", + x_labels = c("Annotated Genes", "Novel Genes"), + y_label = paste(entity_label_plural, ", counts", sep = ""), + fill_map = c("Annotated_genes" = fill_color_orange, "Novel_genes" = fill_color_orange), + plot_args = pivot_defaults + )) + + if ("Genes_in_cell" %in% colnames(SQANTI_cell_summary)) { + SQANTI_cell_summary$Annotated_genes_perc <- ifelse( + SQANTI_cell_summary$Genes_in_cell > 0, + 100 * SQANTI_cell_summary$Annotated_genes / SQANTI_cell_summary$Genes_in_cell, + 0 + ) + SQANTI_cell_summary$Novel_genes_perc <- ifelse( + SQANTI_cell_summary$Genes_in_cell > 0, + 100 * SQANTI_cell_summary$Novel_genes / SQANTI_cell_summary$Genes_in_cell, + 0 + ) + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_annotation_of_genes_percent_in_cell", + columns = c("Annotated_genes_perc", "Novel_genes_perc"), + title = "Percentage of Known/Novel Genes Across Cells", + x_labels = c("Annotated Genes", "Novel Genes"), + y_label = "Genes, %", + fill_map = c("Annotated_genes_perc" = fill_color_orange, "Novel_genes_perc" = fill_color_orange), + plot_args = pivot_defaults + )) + } + + # 5. Percentage of Reads/Transcripts from Known/Novel Genes Across Cells + # (Enabled for both reads and isoforms modes) + { + classification_valid <- Classification_file[Classification_file$CB != "unassigned" & !is.na(Classification_file$CB), ] + + if (nrow(classification_valid) > 0) { + # Function to expand FL and CB columns into a long format for correct counting per cell + expand_isoform_counts <- function(df, mode) { + if (mode == "reads") { + return(df %>% group_by(CB) %>% summarise(count = n(), .groups = "drop")) + } else { + # Isoforms mode: Each row has comma-separated FL (counts) and CB (barcodes) + # We need to split them and sum counts per barcode + + # Initialize lists to store expanded data + all_cbs <- character() + all_counts <- numeric() + + # Iterate through rows (this might be slow for huge files, but safe) + # A vectorised approach would be better if possible, but strsplit returns list + fl_list <- strsplit(as.character(df$FL), ",") + cb_list <- strsplit(as.character(df$CB), ",") + + # Check if lengths match (they should) + if (length(fl_list) != length(cb_list)) { + stop("Mismatch in row counts between FL and CB columns") + } + + # Use mapply to create a data frame of all counts + # This creates a list of data frames, one per isoform + expanded_list <- mapply(function(fl, cb) { + if (length(fl) != length(cb)) { + # Warning or skip? For now, we assume they match as per SQANTI specs + return(NULL) + } + data.frame(CB = cb, count = as.numeric(fl), stringsAsFactors = FALSE) + }, fl_list, cb_list, SIMPLIFY = FALSE) + + # Bind all tiny data frames + long_df <- do.call(rbind, expanded_list) + + # Now group by CB and sum + return(long_df %>% group_by(CB) %>% summarise(count = sum(count, na.rm = TRUE), .groups = "drop")) + } + } + + annotated_reads_per_cell <- classification_valid %>% + filter(!grepl("^novel", associated_gene)) + + annotated_reads_per_cell <- expand_isoform_counts(annotated_reads_per_cell, mode) %>% + rename(Annotated_genes_reads = count) + + novel_reads_per_cell <- classification_valid %>% + filter(grepl("^novel", associated_gene)) + + novel_reads_per_cell <- expand_isoform_counts(novel_reads_per_cell, mode) %>% + rename(Novel_genes_reads = count) + + SQANTI_cell_summary <- SQANTI_cell_summary %>% + left_join(annotated_reads_per_cell, by = "CB") %>% + left_join(novel_reads_per_cell, by = "CB") + + SQANTI_cell_summary$Annotated_genes_reads[is.na(SQANTI_cell_summary$Annotated_genes_reads)] <- 0 + SQANTI_cell_summary$Novel_genes_reads[is.na(SQANTI_cell_summary$Novel_genes_reads)] <- 0 + + # Revert to original denominator (Total Transcripts in Cell) now that numerators are correct + SQANTI_cell_summary$Annotated_reads_perc <- 100 * SQANTI_cell_summary$Annotated_genes_reads / SQANTI_cell_summary[[count_col]] + SQANTI_cell_summary$Novel_reads_perc <- 100 * SQANTI_cell_summary$Novel_genes_reads / SQANTI_cell_summary[[count_col]] + + SQANTI_cell_summary$Annotated_reads_perc <- ifelse(is.na(SQANTI_cell_summary$Annotated_reads_perc) | is.infinite(SQANTI_cell_summary$Annotated_reads_perc), 0, SQANTI_cell_summary$Annotated_reads_perc) + SQANTI_cell_summary$Novel_reads_perc <- ifelse(is.na(SQANTI_cell_summary$Novel_reads_perc) | is.infinite(SQANTI_cell_summary$Novel_reads_perc), 0, SQANTI_cell_summary$Novel_reads_perc) + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_annotation_of_reads_in_cell", + columns = c("Annotated_reads_perc", "Novel_reads_perc"), + title = paste("Percentage of", entity_label_plural, "from Known/Novel Genes Across Cells"), + x_labels = c("Annotated Genes", "Novel Genes"), + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = c("Annotated_reads_perc" = fill_color_orange, "Novel_reads_perc" = fill_color_orange), + plot_args = pivot_defaults + )) + } else { + message("Warning: No valid classification data found. Skipping read expression by gene annotation plot.") + gg_annotation_of_reads_in_cell <<- ggplot() + + labs(title = "Plot not available") + + theme_minimal() + layout( + title = paste("Percentage of", entity_label_plural, "from Known/Novel Genes Across Cells"), + annotations = list( + text = paste(entity_label, "expression by gene annotation\nnot available"), + showarrow = FALSE, + font = list(size = 16, color = "gray") + ) + ) + } + } + + single_violin(SQANTI_cell_summary, list( + name = "gg_MT_perc", + column = "MT_perc", + title = paste("Mitochondrial", entity_label_plural, "Across Cells"), + x_labels = c("Cell"), + y_label = paste(entity_label_plural, ", %", sep = ""), + fill = "#CC6633", + plot_args = common_plot_args + )) + + ### Gene Distribution by Read Count Bins (configurable gene bins) ### + #################################################################### + + # Define gene read-count bins and labels + gene_bin_label <- function(n) { + if (is.na(n)) { + return(NA_character_) + } + if (n == 1) { + return("1") + } + if (n >= 2 && n <= 5) { + return("2-5") + } + if (n >= 6 && n <= 9) { + return("6-9") + } + return(">=10") + } + gene_bin_levels <- c("1", "2-5", "6-9", ">=10") + + # Build per-cell per-gene read counts from classification. + # In isoforms mode, must explode the comma-separated CB/FL columns so each + # (cell, isoform) pair is weighted by its FL count, then sum per (CB, gene). + # In reads mode, each row is one read so n() is correct. + if (mode == "isoforms" && "FL" %in% colnames(Classification_file) && "CB" %in% colnames(Classification_file)) { + genes_by_cb_base <- Classification_file %>% + filter(!is.na(CB), CB != "unassigned", !is.na(associated_gene)) %>% + select(CB, FL, associated_gene) + + genes_by_cb_base$CB_raw <- as.character(genes_by_cb_base$CB) + genes_by_cb_base$FL_raw <- as.character(genes_by_cb_base$FL) + genes_by_cb_base <- tidyr::separate_rows(genes_by_cb_base, CB_raw, FL_raw, sep = ",") + genes_by_cb_base$FL_num <- suppressWarnings(as.numeric(trimws(genes_by_cb_base$FL_raw))) + genes_by_cb_base$FL_num[is.na(genes_by_cb_base$FL_num) | genes_by_cb_base$FL_num < 0] <- 0 + genes_by_cb_base$CB_clean <- trimws(genes_by_cb_base$CB_raw) + + genes_by_cb <- genes_by_cb_base %>% + filter(CB_clean != "" & CB_clean != "unassigned" & FL_num > 0) %>% + group_by(CB = CB_clean, associated_gene) %>% + summarise(reads_per_gene = sum(FL_num), .groups = "drop") %>% + mutate( + gene_type = ifelse(grepl("^novel", associated_gene), "Novel", "Annotated"), + bin = vapply(reads_per_gene, gene_bin_label, character(1)) + ) %>% + filter(!is.na(bin)) + } else { + genes_by_cb <- Classification_file %>% + filter(!is.na(CB), CB != "unassigned", !is.na(associated_gene)) %>% + group_by(CB, associated_gene) %>% + summarise(reads_per_gene = n(), .groups = "drop") %>% + mutate( + gene_type = ifelse(grepl("^novel", associated_gene), "Novel", "Annotated"), + bin = vapply(reads_per_gene, gene_bin_label, character(1)) + ) %>% + filter(!is.na(bin)) + } + + # Percent of genes per bin within each CB and gene type + read_bins_data <- genes_by_cb %>% + group_by(CB, gene_type, bin) %>% + summarise(num_genes = n(), .groups = "drop") %>% + group_by(CB, gene_type) %>% + mutate(percentage = 100 * num_genes / sum(num_genes)) %>% + ungroup() %>% + tidyr::complete(CB, gene_type, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) + + read_bins_data$bin <- factor(read_bins_data$bin, levels = gene_bin_levels) + read_bins_data$gene_type <- factor(read_bins_data$gene_type, levels = c("Annotated", "Novel")) + + if (mode == "isoforms") { + gg_read_bins <<- build_grouped_violin_plot( + df = read_bins_data %>% transmute(bin = as.character(bin), group = as.character(gene_type), value = percentage), + bin_levels = gene_bin_levels, + group_levels = c("Annotated", "Novel"), + title = paste("Distribution of Known/Novel Genes by", entity_label, "Count Bins Across Cells"), + fill_map = c("Annotated" = "#e37744", "Novel" = "#78C679"), + legend_labels = c("Annotated" = "Annotated", "Novel" = "Novel"), + y_label = "Genes, %", + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_width = 0.28, + dodge_width = 1.0 + ) + } + + # Combined (all genes together): one violin per bin + if (mode == "reads") { + # Filter for Annotated genes only + read_bins_all <- genes_by_cb %>% + filter(gene_type == "Annotated") %>% + group_by(CB, bin) %>% + summarise(num_genes = n(), .groups = "drop") %>% + group_by(CB) %>% + mutate(percentage = 100 * num_genes / sum(num_genes)) %>% + ungroup() %>% + tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) + + plot_title_all <- paste("Distribution of Annotated Genes by", entity_label, "Count Bins Across Cells") + } else { + # All genes (Annotated + Novel) + read_bins_all <- genes_by_cb %>% + group_by(CB, bin) %>% + summarise(num_genes = n(), .groups = "drop") %>% + group_by(CB) %>% + mutate(percentage = 100 * num_genes / sum(num_genes)) %>% + ungroup() %>% + tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) + + plot_title_all <- paste("Distribution of Genes by", entity_label, "Count Bins Across Cells") + } + + read_bins_all$bin <- factor(read_bins_all$bin, levels = gene_bin_levels) + + { + df_long <- data.frame( + Variable = factor(read_bins_all$bin, levels = gene_bin_levels), + Value = read_bins_all$percentage + ) + fill_map <- setNames(rep("#CC6633", length(gene_bin_levels)), gene_bin_levels) + gg_read_bins_all <<- build_violin_plot( + df_long, + title = plot_title_all, + x_labels = as.character(gene_bin_levels), + fill_map = fill_map, + y_label = "Genes, %", + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + box_outline_default = "black", + violin_outline_fill = FALSE + ) + } + + # New plot: Distribution of Known Genes by Unique Isoform Count Bins Across Cells (Isoforms mode) + if (mode == "isoforms") { + iso_bins_annot <- genes_by_cb %>% + filter(gene_type == "Annotated") %>% + group_by(CB, bin) %>% + summarise(num_genes = n(), .groups = "drop") %>% + group_by(CB) %>% + mutate(percentage = 100 * num_genes / sum(num_genes)) %>% + ungroup() %>% + tidyr::complete(CB, bin = gene_bin_levels, fill = list(num_genes = 0, percentage = 0)) + + iso_bins_annot$bin <- factor(iso_bins_annot$bin, levels = gene_bin_levels) + + df_long_iso <- data.frame( + Variable = factor(iso_bins_annot$bin, levels = gene_bin_levels), + Value = iso_bins_annot$percentage + ) + # Use Annotated color #e37744 + fill_map_iso <- setNames(rep("#e37744", length(gene_bin_levels)), gene_bin_levels) + + gg_isoform_bins <<- build_violin_plot( + df_long_iso, + title = "Distribution of Known Genes by Unique Isoform Count Bins Across Cells", + x_labels = as.character(gene_bin_levels), + fill_map = fill_map_iso, + y_label = "Genes, %", + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + box_outline_default = "black", + violin_outline_fill = FALSE + ) + } + + # UJC bins (combined) using jxn strings per gene per CB + if (mode != "isoforms") { + ujc_bin_label <- function(n) { + if (is.na(n)) { + return(NA_character_) + } + if (n == 1) { + return("1") + } + if (n >= 2 && n <= 3) { + return("2-3") + } + if (n >= 4 && n <= 5) { + return("4-5") + } + return(">=6") + } + ujc_bin_levels <- c("1", "2-3", "4-5", ">=6") + + # Check if jxn_string exists (it won't if --skip_hash was used) + if ("jxn_string" %in% colnames(Classification_file)) { + ujc_by_cb <- Classification_file %>% + filter(!is.na(CB), CB != "unassigned", !is.na(associated_gene), exons > 1) %>% + group_by(CB, associated_gene) %>% + summarise(ujc_per_gene = dplyr::n_distinct(jxn_string), .groups = "drop") %>% + mutate(bin = vapply(ujc_per_gene, ujc_bin_label, character(1))) %>% + filter(!is.na(bin)) + } else { + ujc_by_cb <- data.frame() + } + + if (nrow(ujc_by_cb) > 0) { + # For reads mode, filter for Annotated genes only + if (mode == "reads") { + ujc_by_cb <- ujc_by_cb %>% + mutate(gene_type = ifelse(grepl("^novel", associated_gene), "Novel", "Annotated")) %>% + filter(gene_type == "Annotated") + + plot_title_ujc_all <- "Distribution of Annotated Genes by UJC Count Bins Across Cells" + } else { + plot_title_ujc_all <- "Distribution of Genes by UJC Count Bins Across Cells" + } + + ujc_bins_all <- ujc_by_cb %>% + group_by(CB, bin) %>% + summarise(num_genes = n(), .groups = "drop") %>% + group_by(CB) %>% + mutate(percentage = 100 * num_genes / sum(num_genes)) %>% + ungroup() %>% + tidyr::complete(CB, bin = ujc_bin_levels, fill = list(num_genes = 0, percentage = 0)) + + ujc_bins_all$bin <- factor(ujc_bins_all$bin, levels = ujc_bin_levels) + + { + df_long <- data.frame( + Variable = factor(ujc_bins_all$bin, levels = ujc_bin_levels), + Value = ujc_bins_all$percentage + ) + } + fill_map <- setNames(rep("#CC6633", length(ujc_bin_levels)), ujc_bin_levels) + gg_ujc_bins_all <<- build_violin_plot( + df_long, + title = plot_title_ujc_all, + x_labels = as.character(ujc_bin_levels), + fill_map = fill_map, + y_label = "Genes, %", + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + box_outline_default = "black", + violin_outline_fill = FALSE + ) + } + + # Create UJC bins data + ujc_bins_data <- data.frame( + CB = rep(SQANTI_cell_summary$CB, 8), + bin = rep(c("1", "2-3", "4-5", ">=6", "1", "2-3", "4-5", ">=6"), each = nrow(SQANTI_cell_summary)), + gene_type = rep(c("Annotated", "Annotated", "Annotated", "Annotated", "Novel", "Novel", "Novel", "Novel"), each = nrow(SQANTI_cell_summary)), + percentage = c( + SQANTI_cell_summary$anno_ujc_bin1_perc, + SQANTI_cell_summary$anno_ujc_bin2_3_perc, + SQANTI_cell_summary$anno_ujc_bin4_5_perc, + SQANTI_cell_summary$anno_ujc_bin6plus_perc, + SQANTI_cell_summary$novel_ujc_bin1_perc, + SQANTI_cell_summary$novel_ujc_bin2_3_perc, + SQANTI_cell_summary$novel_ujc_bin4_5_perc, + SQANTI_cell_summary$novel_ujc_bin6plus_perc + ) + ) + + # Handle NA and invalid values + ujc_bins_data <- ujc_bins_data %>% + mutate(percentage = ifelse(is.na(percentage) | is.infinite(percentage) | percentage < 0, 0, percentage)) + + ujc_bins_data$bin <- factor(ujc_bins_data$bin, levels = c("1", "2-3", "4-5", ">=6")) + ujc_bins_data$gene_type <- factor(ujc_bins_data$gene_type, levels = c("Annotated", "Novel")) + + # Only generate split plot if NOT in reads mode + if (mode != "reads") { + gg_ujc_bins <<- build_grouped_violin_plot( + df = ujc_bins_data %>% transmute(bin = as.character(bin), group = as.character(gene_type), value = percentage), + bin_levels = ujc_bin_levels, + group_levels = c("Annotated", "Novel"), + title = "Distribution of Known/Novel Genes by UJC Count Bins Across Cells", + fill_map = c("Annotated" = "#e37744", "Novel" = "#78C679"), + legend_labels = c("Annotated" = "Annotated", "Novel" = "Novel"), + y_label = "Genes, %", + ylim = c(0, 100), + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_width = 0.28, + dodge_width = 1.0 + ) + } + } + + # Mitochondrial percentage in cell + { + df_long <- data.frame(Variable = "MT_perc", Value = SQANTI_cell_summary$MT_perc) + df_long$Variable <- factor(df_long$Variable, levels = "MT_perc") + fill_map <- c("MT_perc" = fill_color_orange) + x_labels <- c("Cell") + gg_MT_perc <<- build_violin_plot( + df_long, + title = paste("Mitochondrial", entity_label_plural, "\nAcross Cells"), + x_labels = x_labels, + fill_map = fill_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + violin_alpha = 0.5, + box_alpha = 0.3, + box_width = 0.05, + box_outline_default = "black", + violin_outline_fill = FALSE, + x_tickangle = 45 + ) + } + + # Mono/multi-exon prop novel vs annotated genes + + ### Length distribution ### + ########################### + + # Compact helpers for repeated per-category and length plots + cat_tags <- c("FSM", "ISM", "NIC", "NNC", "Genic", "Antisense", "Fusion", "Intergenic", "Genic_intron") + cat_labels_pretty <- c("FSM", "ISM", "NIC", "NNC", "Genic\nGenomic", "Antisense", "Fusion", "Intergenic", "Genic\nIntron") + cat_fill_map <- c(FSM = "#6BAED6", ISM = "#FC8D59", NIC = "#78C679", NNC = "#EE6A50", Genic = "#969696", Antisense = "#66C2A4", Fusion = "goldenrod1", Intergenic = "darksalmon", Genic_intron = "#41B6C4") + structural_category_map <- c( + "full-splice_match" = "FSM", + "incomplete-splice_match" = "ISM", + "novel_in_catalog" = "NIC", + "novel_not_in_catalog" = "NNC", + "genic" = "Genic", + "antisense" = "Antisense", + "fusion" = "Fusion", + "intergenic" = "Intergenic", + "genic_intron" = "Genic_intron" + ) + structural_category_levels <- unname(structural_category_map) + # Build violin across categories and assign to a global name + # Helper: build 9 tag column names from suffix (e.g. "_intrapriming_prop") + cat_cols <- function(suffix) paste0(cat_tags, suffix) + # Length plot generator and variable name mapping + cat_var_base <- c(FSM = "FSM", ISM = "ISM", NIC = "NIC", NNC = "NNC", Genic = "genic", Antisense = "antisense", Fusion = "fusion", Intergenic = "intergenic", Genic_intron = "genic_intron") + make_len_plot <- function(prefix, pretty, color, mono = FALSE) { + var_nm <- if (mono) paste0("gg_", cat_var_base[[prefix]], "_mono_read_distr") else paste0("gg_", cat_var_base[[prefix]], "_read_distr") + title_txt <- if (mono) paste0(pretty, " Mono-exonic Read Lengths Distribution Across Cells") else paste0(pretty, " Reads Length Distribution Across Cells") + assign(var_nm, build_len_violin_for_prefix( + SQANTI_cell_summary, + prefix = prefix, + title = title_txt, + fill_color = color, + box_fill = color, + mono = mono, + violin_alpha = 0.7, + box_alpha = 0.3, + box_outline_color = if (prefix %in% c("Genic")) "grey90" else "grey20", + violin_outline_fill = TRUE + ), envir = .GlobalEnv) + } + + # Bulk distributions + gg_bulk_all_reads <<- ggplot(Classification_file, aes(x = length)) + + geom_histogram(binwidth = 50, fill = "#CC6633", color = "black", alpha = 0.5) + + labs( + title = paste("All", entity_label, "Lengths Distribution"), + x = paste(entity_label, "length"), + y = paste(entity_label_plural, ", counts", sep = "") + ) + + theme_classic() + + theme( + legend.position = "none", + plot.title = element_text(size = 14, face = "bold", hjust = 0.5), + plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), + axis.title = element_text(size = 16), + axis.text.y = element_text(size = 14), + axis.text.x = element_text(size = 16) + ) + + # Bulk read length distribution by structural category + Classification_file$structural_category <- factor( + Classification_file$structural_category, + levels = c( + "full-splice_match", + "incomplete-splice_match", + "novel_in_catalog", + "novel_not_in_catalog", + "genic", + "antisense", + "fusion", + "intergenic", + "genic_intron" + ) + ) + + structural_category_labels <- c( + "full-splice_match" = "FSM", + "incomplete-splice_match" = "ISM", + "novel_in_catalog" = "NIC", + "novel_not_in_catalog" = "NNC", + "genic" = "Genic Genomic", + "antisense" = "Antisense", + "fusion" = "Fusion", + "intergenic" = "Intergenic", + "genic_intron" = "Genic Intron" + ) + structural_category_palette <- c( + "FSM" = "#6BAED6", + "ISM" = "#FC8D59", + "NIC" = "#78C679", + "NNC" = "#EE6A50", + "Genic Genomic" = "#969696", + "Antisense" = "#66C2A4", + "Fusion" = "goldenrod1", + "Intergenic" = "darksalmon", + "Genic Intron" = "#41B6C4" + ) + Classification_file$structural_category_pretty <- structural_category_labels[as.character(Classification_file$structural_category)] + Classification_file$structural_category_pretty <- factor( + Classification_file$structural_category_pretty, + levels = names(structural_category_palette) + ) + + gg_bulk_length_by_category <<- ggplot(Classification_file, aes(x = length, color = structural_category_pretty)) + + geom_freqpoly(binwidth = 100, linewidth = 1.2, na.rm = TRUE) + + labs( + title = paste("All", entity_label, "Lengths Distribution by Structural Category"), + x = paste(entity_label, "length"), + y = paste(entity_label_plural, ", counts", sep = ""), + color = NULL + ) + + theme_classic(base_size = 16) + + scale_color_manual(values = structural_category_palette, drop = FALSE) + + scale_x_continuous( + breaks = scales::pretty_breaks(n = 8), + labels = scales::comma + ) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.key.size = unit(0.8, "cm"), + legend.text = element_text(size = 12), + plot.title = element_text(size = 14, face = "bold", hjust = 0.5), + plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), + axis.title = element_text(size = 16), + axis.text.y = element_text(size = 14), + axis.text.x = element_text(size = 16) + ) + + guides(color = guide_legend(nrow = 2)) + + # Mono vs multi-exon classification for length + Classification_file$exons <- as.numeric(Classification_file$exons) + + gg_bulk_length_by_exon_type <<- ggplot( + Classification_file, + aes(x = length, color = ifelse(exons == 1, "Mono-Exon", "Multi-Exon")) + ) + + geom_freqpoly(binwidth = 100, linewidth = 1.2, na.rm = TRUE) + + labs( + title = paste("Mono- vs Multi- Exon", entity_label, "Lengths Distribution"), + x = paste(entity_label, "length"), + y = paste(entity_label_plural, ", counts", sep = ""), + color = NULL + ) + + theme_classic(base_size = 16) + + scale_color_manual( + values = c("Multi-Exon" = "#3B0057", "Mono-Exon" = "#FFE44C") + ) + + scale_x_continuous( + breaks = scales::pretty_breaks(n = 8), + labels = scales::comma + ) + + theme( + legend.position = "bottom", + legend.title = element_blank(), + legend.key.size = unit(1, "cm"), + legend.text = element_text(size = 14), + plot.title = element_text(size = 14, face = "bold", hjust = 0.5), + plot.margin = margin(t = 40, r = 5, b = 5, l = 5, unit = "pt"), + axis.title = element_text(size = 16), + axis.text.y = element_text(size = 14), + axis.text.x = element_text(size = 16) + ) + + # Cell-level length distributions (all + mono) + gg_read_distr <<- build_len_violin_for_prefix( + SQANTI_cell_summary, + prefix = "Total", + title = paste(entity_label_plural, "Length Distribution Across Cells"), + fill_color = "#CC6633", + box_fill = "#CC6633", + mono = FALSE, + violin_alpha = 0.7, + box_alpha = 0.6, + box_outline_color = "grey20", + violin_outline_fill = FALSE + ) + + # Mono-exon length distribution per break + gg_read_distr_mono <<- build_len_violin_for_prefix( + SQANTI_cell_summary, + prefix = "Total", + title = paste("Mono-exonic", entity_label_plural, "Length Distribution Across Cells"), + fill_color = "#CC6633", + box_fill = "#CC6633", + mono = TRUE, + violin_alpha = 0.7, + box_alpha = 0.6, + box_outline_color = "grey20", + violin_outline_fill = FALSE + ) + + # Per-category length distributions via loop + len_specs <- list( + list(tag = "FSM", pretty = "FSM", color = "#6BAED6"), + list(tag = "ISM", pretty = "ISM", color = "#FC8D59"), + list(tag = "NIC", pretty = "NIC", color = "#78C679"), + list(tag = "NNC", pretty = "NNC", color = "#EE6A50"), + list(tag = "Genic", pretty = "Genic", color = "#969696"), + list(tag = "Antisense", pretty = "Antisense", color = "#66C2A4"), + list(tag = "Fusion", pretty = "Fusion", color = "goldenrod1"), + list(tag = "Intergenic", pretty = "Intergenic", color = "darksalmon"), + list(tag = "Genic_intron", pretty = "Genic Intron", color = "#41B6C4") + ) + for (sp in len_specs) { + make_len_plot(sp$tag, sp$pretty, sp$color, mono = FALSE) + } + for (sp in len_specs) { + # Mono versions where meaningful (skip NNC and Fusion for PDF) + if (sp$tag %in% c("NNC", "Fusion")) next + make_len_plot(sp$tag, sp$pretty, sp$color, mono = TRUE) + } + + ### Reference coverage across categories ### + ############################################ + + { + # Only FSM and ISM have a meaningful ref_length association; other categories are excluded. + cols <- c("FSM_ref_coverage_prop", "ISM_ref_coverage_prop") + gg_SQANTI_pivot <- pivot_long(SQANTI_cell_summary, cols) + fill_map <- c("FSM_ref_coverage_prop" = "#6BAED6", "ISM_ref_coverage_prop" = "#FC8D59") + x_labels <- c("FSM", "ISM") + # Build dynamic title using cutoff from cell summary + ref_cov_min_pct <- if ("ref_cov_min_pct" %in% colnames(SQANTI_cell_summary)) { + vals <- unique(stats::na.omit(SQANTI_cell_summary$ref_cov_min_pct)) + if (length(vals) > 0) as.numeric(vals[1]) else NA_real_ + } else { + NA_real_ + } + pct_lbl <- if (is.finite(ref_cov_min_pct)) { + if (abs(ref_cov_min_pct - round(ref_cov_min_pct)) < 1e-6) sprintf("%.0f", ref_cov_min_pct) else sprintf("%.1f", ref_cov_min_pct) + } else { + NULL + } + title_txt <- if (!is.null(pct_lbl)) { + paste0(entity_label_plural, " with Coverage >=", pct_lbl, "% of the Reference Transcript Length\nby Structural Category Across Cells") + } else { + "Reference Transcript Length Coverage\nby Structural Category Across Cells" + } + gg_ref_coverage_across_category <<- build_violin_plot( + gg_SQANTI_pivot, + title = title_txt, + x_labels = x_labels, + fill_map = fill_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + violin_outline_fill = TRUE + ) + } + + # Meta-transcript body coverage profile (bulk, not per-cell) + gg_meta_transcript_coverage <<- build_meta_coverage_plot(Classification_file) + + # Reference Transcriptome vs Sample length distribution (both isoforms and reads modes) + gg_isoforms_ref_vs_sample_lengths <<- build_ref_vs_sample_lengths(Classification_file, ref_gtf_path, mode) + + ### Structural categories ### + + category_fill_map <- c( + "FSM_prop" = "#6BAED6", "ISM_prop" = "#FC8D59", "NIC_prop" = "#78C679", "NNC_prop" = "#EE6A50", + "Genic_Genomic_prop" = "#969696", "Antisense_prop" = "#66C2A4", "Fusion_prop" = "goldenrod1", + "Intergenic_prop" = "darksalmon", "Genic_intron_prop" = "#41B6C4" + ) + pivot_violin(SQANTI_cell_summary, list( + name = "gg_SQANTI_across_category", + columns = names(category_fill_map), + title = "Structural Categories Distribution Across Cells", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = category_fill_map, + plot_args = list(override_outline_vars = c("Genic_Genomic_prop"), violin_outline_fill = TRUE) + )) + + # Coding/non-coding across structural categories (change it in the future to a combine plot) + if (!skipORF) { + # Update to new column naming convention: {tag}_coding_prop + # Explicitly define columns to match cell_metrics.py output (lowercase for non-canonical categories) + coding_cols <- c( + "FSM_coding_prop", "ISM_coding_prop", "NIC_coding_prop", "NNC_coding_prop", + "genic_coding_prop", "antisense_coding_prop", "fusion_coding_prop", + "intergenic_coding_prop", "genic_intron_coding_prop" + ) + coding_fill_map <- c( + "FSM_coding_prop" = "#6BAED6", + "ISM_coding_prop" = "#FC8D59", + "NIC_coding_prop" = "#78C679", + "NNC_coding_prop" = "#EE6A50", + "genic_coding_prop" = "#969696", + "antisense_coding_prop" = "#66C2A4", + "fusion_coding_prop" = "goldenrod1", + "intergenic_coding_prop" = "darksalmon", + "genic_intron_coding_prop" = "#41B6C4" + ) + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_coding_across_category", + columns = names(coding_fill_map), + title = "Coding Proportion of Structural Categories Distribution Across Cells", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = coding_fill_map, + plot_args = list(override_outline_vars = c("genic_coding_prop"), violin_outline_fill = TRUE) + )) + + # Define colors for non-coding (same as coding but will use alpha) + noncoding_fill_map <- c( + "FSM_non_coding_prop" = "#6BAED6", + "ISM_non_coding_prop" = "#FC8D59", + "NIC_non_coding_prop" = "#78C679", + "NNC_non_coding_prop" = "#EE6A50", + "genic_non_coding_prop" = "#969696", + "antisense_non_coding_prop" = "#66C2A4", + "fusion_non_coding_prop" = "goldenrod1", + "intergenic_non_coding_prop" = "darksalmon", + "genic_intron_non_coding_prop" = "#41B6C4" + ) + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_non_coding_across_category", + columns = names(noncoding_fill_map), + title = "Non-coding Proportion of Structural Categories Distribution Across Cells", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = noncoding_fill_map, + plot_args = list( + override_outline_vars = c(), + violin_outline_fill = TRUE, + violin_alpha = 0.4, + box_alpha = 0.1 + ) + )) + } # End of if (!skipORF) + + subcategory_configs <- list( + list( + name = "gg_SQANTI_across_FSM", + columns = c( + "FSM_alternative_3end_prop", "FSM_alternative_3end5end_prop", "FSM_alternative_5end_prop", + "FSM_reference_match_prop", "FSM_mono_exon_prop" + ), + title = "FSM Structural Subcategories Distribution Across Cells", + x_labels = c("Alternative 3'end", "Alternative 3'5'end", "Alternative 5'end", "Reference match", "Mono-exon"), + fill_map = c( + "FSM_alternative_3end_prop" = "#02314d", "FSM_alternative_3end5end_prop" = "#0e5a87", + "FSM_alternative_5end_prop" = "#7ccdfc", "FSM_reference_match_prop" = "#c4e1f2", + "FSM_mono_exon_prop" = "#cec2d2" + ), + plot_args = list(override_outline_vars = c("FSM_alternative_3end_prop", "FSM_alternative_3end5end_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_ISM", + columns = c( + "ISM_3prime_fragment_prop", "ISM_internal_fragment_prop", "ISM_5prime_fragment_prop", + "ISM_intron_retention_prop", "ISM_mono_exon_prop" + ), + title = "ISM Structural Subcategories Distribution Across Cells", + x_labels = c("3' fragment", "Internal fragment", "5' fragment", "Intron retention", "Mono-exon"), + fill_map = c( + "ISM_3prime_fragment_prop" = "#c4531d", "ISM_internal_fragment_prop" = "#e37744", + "ISM_5prime_fragment_prop" = "#e0936e", "ISM_intron_retention_prop" = "#81eb82", + "ISM_mono_exon_prop" = "#cec2d2" + ), + plot_args = list(override_outline_vars = c("ISM_3prime_fragment_prop", "ISM_internal_fragment_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_NIC", + columns = c( + "NIC_combination_of_known_junctions_prop", "NIC_combination_of_known_splicesites_prop", + "NIC_intron_retention_prop", "NIC_mono_exon_by_intron_retention_prop", "NIC_mono_exon_prop" + ), + title = "NIC Structural Subcategories Distribution Across Cells", + x_labels = c("Comb. of annot. junctions", "Comb. of annot. splice sites", "Intron retention", "Mono-exon by intron ret.", "Mono-exon"), + fill_map = c( + "NIC_combination_of_known_junctions_prop" = "#014d02", "NIC_combination_of_known_splicesites_prop" = "#379637", + "NIC_intron_retention_prop" = "#81eb82", "NIC_mono_exon_by_intron_retention_prop" = "#4aaa72", + "NIC_mono_exon_prop" = "#cec2d2" + ), + plot_args = list( + override_outline_vars = c( + "NIC_combination_of_known_junctions_prop", "NIC_combination_of_known_splicesites_prop", + "NIC_mono_exon_by_intron_retention_prop", "NIC_mono_exon_prop" + ), + violin_outline_fill = TRUE + ) + ), + list( + name = "gg_SQANTI_across_NNC", + columns = c("NNC_at_least_one_novel_splicesite_prop", "NNC_intron_retention_prop"), + title = "NNC Structural Subcategories Distribution Across Cells", + x_labels = c("At least\n1 annot. don./accept.", "Intron retention"), + fill_map = c("NNC_at_least_one_novel_splicesite_prop" = "#32734d", "NNC_intron_retention_prop" = "#81eb82"), + plot_args = list(override_outline_vars = c("NNC_at_least_one_novel_splicesite_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_Fusion", + columns = c("Fusion_intron_retention_prop", "Fusion_multi_exon_prop"), + title = "Fusion Structural Subcategories Distribution Across Cells", + x_labels = c("Intron retention", "Multi-exon"), + fill_map = c("Fusion_intron_retention_prop" = "#81eb82", "Fusion_multi_exon_prop" = "#876a91"), + plot_args = list(override_outline_vars = c("Fusion_multi_exon_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_Genic", + columns = c("Genic_mono_exon_prop", "Genic_multi_exon_prop"), + title = "Genic Structural Subcategories Distribution Across Cells", + x_labels = c("Mono-exon", "Multi-exon"), + fill_map = c("Genic_mono_exon_prop" = "#81eb82", "Genic_multi_exon_prop" = "#876a91"), + plot_args = list(override_outline_vars = c("Genic_multi_exon_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_Genic_Intron", + columns = c("Genic_intron_mono_exon_prop", "Genic_intron_multi_exon_prop"), + title = "Genic Intron Structural Subcategories Distribution Across Cells", + x_labels = c("Mono-exon", "Multi-exon"), + fill_map = c("Genic_intron_mono_exon_prop" = "#81eb82", "Genic_intron_multi_exon_prop" = "#876a91"), + plot_args = list(override_outline_vars = c("Genic_intron_multi_exon_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_Antisense", + columns = c("Antisense_mono_exon_prop", "Antisense_multi_exon_prop"), + title = "Antisense Structural Subcategories Distribution Across Cells", + x_labels = c("Mono-exon", "Multi-exon"), + fill_map = c("Antisense_mono_exon_prop" = "#81eb82", "Antisense_multi_exon_prop" = "#876a91"), + plot_args = list(override_outline_vars = c("Antisense_multi_exon_prop"), violin_outline_fill = TRUE) + ), + list( + name = "gg_SQANTI_across_Intergenic", + columns = c("Intergenic_mono_exon_prop", "Intergenic_multi_exon_prop"), + title = "Intergenic Structural Subcategories Distribution Across Cells", + x_labels = c("Mono-exon", "Multi-exon"), + fill_map = c("Intergenic_mono_exon_prop" = "#81eb82", "Intergenic_multi_exon_prop" = "#876a91"), + plot_args = list(override_outline_vars = c("Intergenic_multi_exon_prop"), violin_outline_fill = TRUE) + ) + ) + invisible(lapply(subcategory_configs, function(cfg) pivot_violin(SQANTI_cell_summary, cfg))) + + ### Splice junctions characterization ### + ######################################### + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_known_novel_canon", + columns = c("Known_canonical_junctions_prop", "Known_non_canonical_junctions_prop", "Novel_canonical_junctions_prop", "Novel_non_canonical_junctions_prop"), + title = "Splice Junctions Distribution Across Cells", + x_labels = c("Known\nCanonical", "Known\nNon-canonical", "Novel\nCanonical", "Novel\nNon-canonical"), + y_label = "Junctions, %", + fill_map = c( + "Known_canonical_junctions_prop" = "#6BAED6", + "Known_non_canonical_junctions_prop" = "goldenrod1", + "Novel_canonical_junctions_prop" = "#78C679", + "Novel_non_canonical_junctions_prop" = "#FC8D59" + ), + plot_args = list(violin_outline_fill = TRUE) + )) + ### Good features plots (SR & TSS Validation) ### + ################################################# + + # 1. Combined Good Features Plot (All Transcripts) + all_good_features_map <- list( + "srjunctions_support_prop" = list(label = "SJs Validated by SRs", color = "#cd4f39"), + "TSS_ratio_validated_prop" = list(label = "TSS Validated by SRs", color = "#FFC125") + # Add other good features here if needed (e.g. polyA_motif_found_prop if available) + ) + + # Determine which good feature columns are present + good_feature_cols_present <- intersect(names(all_good_features_map), colnames(SQANTI_cell_summary)) + good_feature_cols_present <- good_feature_cols_present[sapply(good_feature_cols_present, function(col) any(!is.na(SQANTI_cell_summary[[col]])) && sum(SQANTI_cell_summary[[col]], na.rm = TRUE) > 0)] + + if (length(good_feature_cols_present) > 0) { + current_good_colors <- sapply(all_good_features_map[good_feature_cols_present], function(x) x$color) + current_good_labels <- sapply(all_good_features_map[good_feature_cols_present], function(x) x$label) + names(current_good_colors) <- good_feature_cols_present + names(current_good_labels) <- good_feature_cols_present + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_good_feature", + columns = good_feature_cols_present, + title = "Validation Features Distribution Across Cells", + x_labels = current_good_labels, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = current_good_colors, + plot_args = list(violin_outline_fill = TRUE) + )) + } + + # 2. Per-Category Plots + # Short Read (SJs) Support + sr_cat_cols <- cat_cols("_srjunctions_support_prop") + if (all(sr_cat_cols %in% colnames(SQANTI_cell_summary)) && any(colSums(SQANTI_cell_summary[, sr_cat_cols, drop = FALSE], na.rm = TRUE) > 0)) { + pivot_violin(SQANTI_cell_summary, list( + name = "gg_sr_support_by_category", + columns = sr_cat_cols, + title = "SJs Validated by Short Reads by Structural Category", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = setNames(rep("#cd4f39", length(sr_cat_cols)), sr_cat_cols), + plot_args = list(violin_outline_fill = TRUE) + )) + } + + # TSS Validation Support + tss_cat_cols <- cat_cols("_TSS_ratio_validated_prop") + if (all(tss_cat_cols %in% colnames(SQANTI_cell_summary)) && any(colSums(SQANTI_cell_summary[, tss_cat_cols, drop = FALSE], na.rm = TRUE) > 0)) { + pivot_violin(SQANTI_cell_summary, list( + name = "gg_tss_validation_by_category", + columns = tss_cat_cols, + title = "TSS Validated by Short Reads by Structural Category", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = setNames(rep("#FFC125", length(tss_cat_cols)), tss_cat_cols), + plot_args = list(violin_outline_fill = TRUE) + )) + } + ### Bad features plots ### + ########################## + + bad_specs <- list( + list(suffix = "_intrapriming_prop", title = "Intrapriming by Structural Category", color = "#78C679", name = "gg_intrapriming_by_category"), + list(suffix = "_RTS_prop", title = "RT-switching by Structural Category", color = "#FF9933", name = "gg_RTS_by_category"), + list(suffix = "_noncanon_prop", title = "Non-Canonical Junctions by Structural Category", color = "#41B6C4", name = "gg_noncanon_by_category") + ) + invisible(lapply(bad_specs, function(sp) { + cols <- cat_cols(sp$suffix) + pivot_violin(SQANTI_cell_summary, list( + name = sp$name, + columns = cols, + title = sp$title, + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = setNames(rep(sp$color, length(cols)), cols), + plot_args = list(violin_outline_fill = TRUE) + )) + })) + + # NMD (split between categories) + nmd_cols <- c("FSM_NMD_prop", "ISM_NMD_prop", "NIC_NMD_prop", "NNC_NMD_prop", "Genic_NMD_prop", "Antisense_NMD_prop", "Fusion_NMD_prop", "Intergenic_NMD_prop", "Genic_intron_NMD_prop") + if (all(nmd_cols %in% colnames(SQANTI_cell_summary))) { + pivot_violin(SQANTI_cell_summary, list( + name = "gg_NMD_by_category", + columns = nmd_cols, + title = "Nonsense-Mediated Decay by Structural Category", + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = setNames(rep("#969696", length(nmd_cols)), nmd_cols), + plot_args = list(violin_outline_fill = TRUE) + )) + } + + ## Bad quality features combined figure + # Define all possible features, their colors, and labels + all_bad_features_map <- list( + "Intrapriming_prop_in_cell" = list(label = "Intrapriming", color = "#78C679"), + "RTS_prop_in_cell" = list(label = "RT-switching", color = "#FF9933"), + "Non_canonical_prop_in_cell" = list(label = "Non-Canonical Junctions", color = "#41B6C4"), + "NMD_prop_in_cell" = list(label = "Predicted NMD", color = "#969696") + ) + + # Determine which bad feature columns are actually present in SQANTI_cell_summary + # This implicitly handles skipORF, as NMD_prop_in_cell won't be in SQANTI_cell_summary if skipORF is TRUE + bad_feature_cols_present <- intersect(names(all_bad_features_map), colnames(SQANTI_cell_summary)) + bad_feature_cols_present <- bad_feature_cols_present[sapply(bad_feature_cols_present, function(col) any(!is.na(SQANTI_cell_summary[[col]])) && sum(SQANTI_cell_summary[[col]], na.rm = TRUE) > 0)] # keep only if data exists + + # Order them as originally intended, if present + ordered_bad_feature_cols <- c("Intrapriming_prop_in_cell", "RTS_prop_in_cell", "Non_canonical_prop_in_cell", "NMD_prop_in_cell") + bad_feature_cols_present <- intersect(ordered_bad_feature_cols, bad_feature_cols_present) + + if (length(bad_feature_cols_present) > 0) { + current_colors <- sapply(all_bad_features_map[bad_feature_cols_present], function(x) x$color) + current_labels <- sapply(all_bad_features_map[bad_feature_cols_present], function(x) x$label) + # Ensure names are correctly assigned for scales, matching the order in bad_feature_cols_present + names(current_colors) <- bad_feature_cols_present + names(current_labels) <- bad_feature_cols_present + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_bad_feature", + columns = bad_feature_cols_present, + title = "Bad Quality Control Attributes Across Cells", + x_labels = current_labels, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = current_colors, + plot_args = list(violin_outline_fill = TRUE) + )) + } else { + gg_bad_feature <<- ggplot() + + labs(title = "Plot not available") + + theme_minimal() + layout( + title = "No bad quality features to display", + annotations = list( + text = "No bad quality features to display", + showarrow = FALSE, + font = list(size = 18, color = "gray") + ) + ) + } + + # Good features plots + + good_specs <- list( + list(cols = cat_cols("_TSSAnnotationSupport"), title = "TSS Annotation Support by Structural Category", color = "#66C2A4", name = "gg_tss_annotation_support", require_all = TRUE), + list(cols = cat_cols("_CAGE_peak_support_prop"), title = "CAGE Peak Support by Structural Category", color = "#EE6A50", name = "gg_cage_peak_support", require_all = TRUE), + list(cols = cat_cols("_PolyA_motif_support_prop"), title = "PolyA Support by Structural Category", color = "#78C679", name = "gg_polyA_motif_support", require_all = TRUE), + list(cols = cat_cols("_canon_prop"), title = "Canonical Junctions by Structural Category", color = "#CC6633", name = "gg_canon_by_category", require_all = TRUE), + list(cols = cat_cols("_srjunctions_support_prop"), title = "Splice Junctions Support by Structural Category", color = "#cd4f39", name = "gg_sr_support_by_category", require_all = TRUE), + list(cols = cat_cols("_TSS_ratio_validated_prop"), title = "TSS Support by Structural Category", color = "#ffc125", name = "gg_tss_validation_by_category", require_all = TRUE) + ) + invisible(lapply(good_specs, function(sp) { + if (!is.null(sp$require_all) && sp$require_all && !all(sp$cols %in% colnames(SQANTI_cell_summary))) { + return(NULL) + } + # Check if data is not empty (all zeros) + if (all(sp$cols %in% colnames(SQANTI_cell_summary)) && all(colSums(SQANTI_cell_summary[, sp$cols, drop = FALSE], na.rm = TRUE) == 0)) { + return(NULL) + } + pivot_violin(SQANTI_cell_summary, list( + name = sp$name, + columns = sp$cols, + title = sp$title, + x_labels = cat_labels_pretty, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = setNames(rep(sp$color, length(sp$cols)), sp$cols), + plot_args = list(violin_outline_fill = TRUE) + )) + })) + + ## Good quality features combined figure + good_feature_cols <- c("TSSAnnotationSupport_prop") + + if ("CAGE_peak_support_prop" %in% colnames(SQANTI_cell_summary)) { + good_feature_cols <- c(good_feature_cols, "CAGE_peak_support_prop") + } + if ("PolyA_motif_support_prop" %in% colnames(SQANTI_cell_summary)) { + good_feature_cols <- c(good_feature_cols, "PolyA_motif_support_prop") + } + good_feature_cols <- c(good_feature_cols, "Canonical_prop_in_cell") + + if ("srjunctions_support_prop" %in% colnames(SQANTI_cell_summary) && sum(SQANTI_cell_summary$srjunctions_support_prop, na.rm = TRUE) > 0) { + good_feature_cols <- c(good_feature_cols, "srjunctions_support_prop") + } + if ("TSS_ratio_validated_prop" %in% colnames(SQANTI_cell_summary) && sum(SQANTI_cell_summary$TSS_ratio_validated_prop, na.rm = TRUE) > 0) { + good_feature_cols <- c(good_feature_cols, "TSS_ratio_validated_prop") + } + + color_map <- c( + "TSSAnnotationSupport_prop" = "#66C2A4", + "CAGE_peak_support_prop" = "#EE6A50", + "PolyA_motif_support_prop" = "#78C679", + "Canonical_prop_in_cell" = "#CC6633", + "srjunctions_support_prop" = "#cd4f39", + "TSS_ratio_validated_prop" = "#ffc125" + ) + label_map <- c( + "TSSAnnotationSupport_prop" = "TSS Annotated", + "CAGE_peak_support_prop" = "Has Coverage CAGE", + "PolyA_motif_support_prop" = "Has PolyA Motif", + "Canonical_prop_in_cell" = "Canonical Junctions", + "srjunctions_support_prop" = "SJs Support by SRs", + "TSS_ratio_validated_prop" = "TSS Support by SRs" + ) + color_map <- color_map[good_feature_cols] + label_map <- label_map[good_feature_cols] + + pivot_violin(SQANTI_cell_summary, list( + name = "gg_good_feature", + columns = good_feature_cols, + title = "Good Quality Control Attributes Across Cells", + x_labels = label_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + fill_map = color_map, + plot_args = list(violin_outline_fill = TRUE) + )) + + ### Exon structure across cells by structural category ### + { + cat_key_map <- structural_category_map + all_cats <- structural_category_levels + x_labels_pretty <- cat_labels_pretty + + fill_map_cat <- cat_fill_map + + # Build per-cell classification with FL-weighted count for ALL exon structure plots. + # In isoforms mode: explode comma-separated CB/FL so each row = (isoform, cell, count=FL). + # This means all exon metrics reflect actual expression levels, not just unique isoform counts. + if (mode == "isoforms" && "FL" %in% colnames(Classification_file) && + "CB" %in% colnames(Classification_file)) { + cls_valid <- Classification_file %>% + filter(!is.na(CB), CB != "", CB != "unassigned") %>% + mutate(CB_raw = as.character(CB), FL_raw = as.character(FL)) %>% + tidyr::separate_rows(CB_raw, FL_raw, sep = ",") %>% + mutate(CB = trimws(CB_raw), + count = suppressWarnings(as.numeric(trimws(FL_raw)))) %>% + filter(CB != "", CB != "unassigned", !is.na(count), count > 0) %>% + mutate(cat_key = unname(cat_key_map[structural_category])) %>% + filter(!is.na(cat_key)) + } else { + cls_valid <- Classification_file %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate(cat_key = unname(cat_key_map[structural_category]), + count = 1) %>% + filter(!is.na(cat_key)) + } + + # 1) FL-weighted mean exons per cell per category + exons_mean_by_cell <- cls_valid %>% + group_by(CB, cat_key) %>% + summarise( + mean_exons = sum(as.numeric(exons) * count, na.rm = TRUE) / sum(count, na.rm = TRUE), + .groups = "drop" + ) %>% + tidyr::complete(CB, cat_key = all_cats, fill = list(mean_exons = NA_real_)) + + df_exon_mean_long <- data.frame( + Variable = factor(exons_mean_by_cell$cat_key, levels = all_cats), + Value = exons_mean_by_cell$mean_exons + ) + + gg_exon_mean_by_category <<- build_violin_plot( + df_long = df_exon_mean_long, + title = paste("Mean Exons per", entity_label, "by Structural Category Across Cells"), + x_labels = x_labels_pretty, + fill_map = fill_map_cat, + y_label = paste("Exons per", entity_label), + legend = FALSE, + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + override_outline_vars = c("Genic"), + adjust = 1 + ) + + # 2) Percent mono-exonic reads per cell and category (FL-weighted in isoforms mode) + exons_bin_by_cell <- cls_valid %>% + mutate(is_mono = as.numeric(exons) == 1) %>% + group_by(CB, cat_key) %>% + summarise(total = sum(count, na.rm = TRUE), + mono = sum(count[is_mono], na.rm = TRUE), .groups = "drop") %>% + mutate(perc_mono = ifelse(total > 0, 100 * mono / total, NA_real_)) %>% + tidyr::complete(CB, cat_key = all_cats) + + df_exon_mono_long <- data.frame( + Variable = factor(exons_bin_by_cell$cat_key, levels = all_cats), + Value = exons_bin_by_cell$perc_mono + ) + + gg_exon_mono_by_category <<- build_violin_plot( + df_long = df_exon_mono_long, + title = paste("Mono-exonic", entity_label_plural, "by Structural Category Across Cells"), + x_labels = x_labels_pretty, + fill_map = fill_map_cat, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + override_outline_vars = c("Genic") + ) + + # 3) Exon count bins per structural category across cells (HTML) + exon_bin_levels <- c("1", "2-3", "4-5", ">=6") + bin_fill_map <- setNames(c("#6BAED6", "#78C679", "#FC8D59", "#969696"), exon_bin_levels) + gg_exon_bins_by_category <<- list() + for (ck in all_cats) { + cat_df <- cls_valid %>% filter(cat_key == ck) + if (nrow(cat_df) == 0) { + next + } + bins_by_cell <- cat_df %>% + mutate( + exons_n = as.numeric(exons), + bin = dplyr::case_when( + exons_n <= 1 ~ "1", + exons_n <= 3 ~ "2-3", + exons_n <= 5 ~ "4-5", + TRUE ~ ">=6" + ) + ) %>% + group_by(CB, bin) %>% + summarise(n = sum(count, na.rm = TRUE), .groups = "drop") %>% + group_by(CB) %>% + mutate(perc = ifelse(sum(n) > 0, 100 * n / sum(n), NA_real_)) %>% + ungroup() %>% + tidyr::complete(CB, bin = exon_bin_levels) + + df_long_bins <- data.frame( + Variable = factor(bins_by_cell$bin, levels = exon_bin_levels), + Value = bins_by_cell$perc + ) + + pretty_name <- switch(ck, + FSM = "FSM", + ISM = "ISM", + NIC = "NIC", + NNC = "NNC", + Genic = "Genic Genomic", + Antisense = "Antisense", + Fusion = "Fusion", + Intergenic = "Intergenic", + Genic_intron = "Genic Intron", + ck + ) + + gg_exon_bins_by_category[[pretty_name]] <<- build_violin_plot( + df_long = df_long_bins, + title = paste0("Exon Count Bins in ", pretty_name, " Across Cells"), + x_labels = exon_bin_levels, + fill_map = bin_fill_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20" + ) + } + + # 4) Exon count profile per category across cells (median + IQR) + K <- 20 + min_reads <- 5 + gg_exon_profile_by_category <<- list() + for (ck in all_cats) { + cat_df <- cls_valid %>% + mutate(cat_key = unname(cat_key_map[structural_category])) %>% + dplyr::filter(cat_key == ck) + if (nrow(cat_df) == 0) next + # Cells with at least min_reads (FL-weighted) in this category + cells_ok <- cat_df %>% + group_by(CB) %>% + summarise(total = sum(count, na.rm = TRUE), .groups = "drop") %>% + filter(total >= min_reads) %>% + pull(CB) + if (length(cells_ok) == 0) { + cells_ok <- unique(cat_df$CB) + } + cat_df2 <- cat_df %>% + filter(CB %in% cells_ok) %>% + mutate(exons_n = pmin(as.numeric(exons), K)) + # Per-cell PMF (FL-weighted: sum of FL counts in each exon bin) + pmf <- cat_df2 %>% + group_by(CB, exons_n) %>% + summarise(n = sum(count, na.rm = TRUE), .groups = "drop") %>% + group_by(CB) %>% + mutate(perc = ifelse(sum(n) > 0, 100 * n / sum(n), 0)) %>% + ungroup() %>% + tidyr::complete(CB, exons_n = seq_len(K), fill = list(n = 0, perc = 0)) + # Aggregate across cells + prof <- pmf %>% + group_by(exons_n) %>% + summarise( + mean = base::mean(perc, na.rm = TRUE), + median = stats::median(perc, na.rm = TRUE), + q1 = stats::quantile(perc, 0.25, na.rm = TRUE, type = 7), + q3 = stats::quantile(perc, 0.75, na.rm = TRUE, type = 7), + .groups = "drop" + ) %>% + rename(k = exons_n) + pretty_name <- switch(ck, + FSM = "FSM", + ISM = "ISM", + NIC = "NIC", + NNC = "NNC", + Genic = "Genic Genomic", + Antisense = "Antisense", + Fusion = "Fusion", + Intergenic = "Intergenic", + Genic_intron = "Genic Intron", + ck + ) + gg_exon_profile_by_category[[pretty_name]] <<- build_exon_profile_plot( + df_prof = prof, title = paste0("Exon Count Profile in ", pretty_name, " Across Cells"), + line_color = fill_map_cat[ck], k_max = K, y_label = paste(entity_label_plural, ", %", sep = ""), n_cells = length(unique(cells_ok)) + ) + } + } + + ### Presets ### + ############### + + # t1 <- ttheme_default(core=list(core = list(fg_params = list(cex = 0.6)), + # colhead = list(fg_params = list(cex = 0.7)))) + + # Resolve common isoform ID key between Junctions and Classification_file once, + # shared by all junction blocks below (HTML junc_aug_html, junc_rt, and PDF junc_aug). + junc_iso_key <- NULL + for (.k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { + if (.k %in% colnames(Junctions) && .k %in% colnames(Classification_file)) { + junc_iso_key <- .k + break + } + } + + # Build SJ per-type/per-category plots and the all-canonical grouped plot for HTML (and reuse for PDF) + # This block creates plot objects regardless of generate_pdf so the Rmd can render them. + { + # Build junc_aug_html: in isoforms mode, Junctions$CB is comma-separated — must explode. + if (mode == "isoforms" && !is.null(junc_iso_key) && + "FL" %in% colnames(Classification_file) && "CB" %in% colnames(Classification_file) && + junc_iso_key %in% colnames(Junctions)) { + + cls_exp_html <- Classification_file %>% + filter(!is.na(CB), !is.na(structural_category)) %>% + select(all_of(c(junc_iso_key, "CB", "FL", "structural_category"))) %>% + mutate(CB_raw = as.character(CB), FL_raw = as.character(FL)) %>% + tidyr::separate_rows(CB_raw, FL_raw, sep = ",") %>% + mutate(CB_clean = trimws(CB_raw), + FL_num = suppressWarnings(as.numeric(trimws(FL_raw)))) %>% + filter(CB_clean != "", CB_clean != "unassigned", !is.na(FL_num), FL_num > 0) %>% + select(all_of(c(junc_iso_key, "CB_clean", "FL_num", "structural_category"))) + + junc_aug_html <- Junctions %>% + select(all_of(c(junc_iso_key, "junction_category", "canonical"))) %>% + mutate(junction_type = paste(junction_category, canonical, sep = "_")) %>% + inner_join(cls_exp_html, by = junc_iso_key) %>% + rename(CB = CB_clean, count = FL_num) + + } else { + junc_aug_html <- Junctions %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate(junction_type = paste(junction_category, canonical, sep = "_"), + count = 1) + if (!("structural_category" %in% colnames(junc_aug_html)) && !is.null(junc_iso_key)) { + junc_aug_html <- junc_aug_html %>% + left_join(Classification_file %>% select(all_of(c(junc_iso_key, "structural_category"))), + by = junc_iso_key) + } + } + + cat_key_map <- structural_category_map + all_cats <- structural_category_levels + x_labels_full <- cat_labels_pretty + + junc_summ_html <- junc_aug_html %>% + filter(!is.na(structural_category)) %>% + mutate(cat_key = unname(cat_key_map[structural_category])) %>% + filter(!is.na(cat_key)) %>% + group_by(CB, cat_key) %>% + summarise( + total = sum(count, na.rm = TRUE), + known_canonical = sum(count[junction_type == "known_canonical"], na.rm = TRUE), + known_non_canonical = sum(count[junction_type == "known_non_canonical"], na.rm = TRUE), + novel_canonical = sum(count[junction_type == "novel_canonical"], na.rm = TRUE), + novel_non_canonical = sum(count[junction_type == "novel_non_canonical"], na.rm = TRUE), + .groups = "drop" + ) %>% + mutate( + KnownCanonicalPerc = ifelse(total > 0, 100 * known_canonical / total, NA_real_), + KnownNonCanonicalPerc = ifelse(total > 0, 100 * known_non_canonical / total, NA_real_), + NovelCanonicalPerc = ifelse(total > 0, 100 * novel_canonical / total, NA_real_), + NovelNonCanonicalPerc = ifelse(total > 0, 100 * novel_non_canonical / total, NA_real_) + ) %>% + tidyr::complete(CB, cat_key = all_cats) %>% + ungroup() + + make_df_long_html <- function(col_name) { + data.frame(Variable = factor(junc_summ_html$cat_key, levels = all_cats), Value = junc_summ_html[[col_name]]) + } + + fill_map_cat <- cat_fill_map + + gg_known_canon_by_category <<- build_violin_plot( + df_long = make_df_long_html("KnownCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Known Canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + gg_known_noncanon_by_category <<- build_violin_plot( + df_long = make_df_long_html("KnownNonCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Known Non-canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + gg_novel_canon_by_category <<- build_violin_plot( + df_long = make_df_long_html("NovelCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Novel Canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + gg_novel_noncanon_by_category <<- build_violin_plot( + df_long = make_df_long_html("NovelNonCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Novel Non-canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + # Stack the four SJ type-by-category plots into one figure + tick_angle_val <- 45 + + p_known_canon_by_category <- build_violin_plot( + df_long = make_df_long_html("KnownCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Known Canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + p_known_noncanon_by_category <- build_violin_plot( + df_long = make_df_long_html("KnownNonCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Known Non-canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + p_novel_canon_by_category <- build_violin_plot( + df_long = make_df_long_html("NovelCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Novel Canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + p_novel_noncanon_by_category <- build_violin_plot( + df_long = make_df_long_html("NovelNonCanonicalPerc"), + title = "", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Novel Non-canonical Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + # Stack the four SJ type-by-category plots into one static figure using gridExtra + # Remove x-axis labels/titles for top 3 plots to mimic shared axis + p1 <- p_known_canon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) + p2 <- p_known_noncanon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) + p3 <- p_novel_canon_by_category + theme(axis.text.x = element_blank(), axis.title.x = element_blank(), axis.ticks.x = element_blank()) + p4 <- p_novel_noncanon_by_category # Keep x-axis for bottom plot + + # Create the static stack + gg_sj_type_by_category_stack <<- gridExtra::arrangeGrob( + p1, p2, p3, p4, + ncol = 1, + top = textGrob("Splice Junctions Distribution by Structural Category Across Cells", gp = gpar(fontsize = 18, fontface = "bold")) + ) + } + + # NEW: RT-switching by splice junction type across cells (all and unique junctions) + if ("RTS_junction" %in% colnames(Junctions)) { + # Normalize boolean + rts_bool_vec <- tolower(as.character(Junctions$RTS_junction)) %in% c("true", "t", "1", "yes") + + # In isoforms mode, Junctions$CB is a comma-separated list — must explode via classification + if (mode == "isoforms" && !is.null(junc_iso_key) && + "FL" %in% colnames(Classification_file) && "CB" %in% colnames(Classification_file) && + junc_iso_key %in% colnames(Junctions)) { + + # Explode Classification_file CB/FL to per-(isoform, cell, FL_count) + cls_exp_rts <- Classification_file %>% + filter(!is.na(CB)) %>% + select(all_of(c(junc_iso_key, "CB", "FL"))) %>% + mutate(CB_raw = as.character(CB), FL_raw = as.character(FL)) %>% + tidyr::separate_rows(CB_raw, FL_raw, sep = ",") %>% + mutate(CB_clean = trimws(CB_raw), + FL_num = suppressWarnings(as.numeric(trimws(FL_raw)))) %>% + filter(CB_clean != "", CB_clean != "unassigned", !is.na(FL_num), FL_num > 0) %>% + select(all_of(c(junc_iso_key, "CB_clean", "FL_num"))) + + # Join junctions (isoform, junction type, RTS flag) to exploded classification + junc_rt <- Junctions %>% + select(all_of(c(junc_iso_key, "junction_category", "canonical", "RTS_junction"))) %>% + mutate(SJ_type = paste(junction_category, canonical, sep = "_"), + RTS_bool = tolower(as.character(RTS_junction)) %in% c("true", "t", "1", "yes")) %>% + inner_join(cls_exp_rts, by = junc_iso_key) %>% + rename(CB = CB_clean, count = FL_num) + + } else { + # Reads mode: each junction already has one CB, count = 1 + junc_rt <- Junctions %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate(SJ_type = paste(junction_category, canonical, sep = "_"), + RTS_bool = rts_bool_vec, + count = 1) + } + + # Ensure consistent SJ type levels and labels + sj_levels <- c("known_canonical", "known_non_canonical", "novel_canonical", "novel_non_canonical") + sj_labels <- c("Known\nCanonical", "Known\nNon-canonical", "Novel\nCanonical", "Novel\nNon-canonical") + junc_rt$SJ_type <- factor(junc_rt$SJ_type, levels = sj_levels) + + # Color map consistent with other SJ type plots + sj_fill_map <- c( + known_canonical = "#6BAED6", + known_non_canonical = "goldenrod1", + novel_canonical = "#78C679", + novel_non_canonical = "#FC8D59" + ) + + # Per-cell percentages for ALL junctions (FL-weighted in isoforms mode) + all_junc_by_cell <- junc_rt %>% + group_by(CB, SJ_type) %>% + summarise(total = sum(count, na.rm = TRUE), + rts = sum(count[RTS_bool], na.rm = TRUE), .groups = "drop") %>% + # Only include (CB, SJ_type) that actually have data; NA for absent pairs + mutate(perc = ifelse(total > 0, 100 * rts / total, NA_real_)) %>% + tidyr::complete(CB, SJ_type = sj_levels) + + df_long_all <- data.frame( + Variable = factor(all_junc_by_cell$SJ_type, levels = sj_levels), + Value = all_junc_by_cell$perc + ) + + gg_rts_all_by_sjtype <<- build_violin_plot( + df_long = df_long_all, + title = "RT-switching All Junctions by Splice Junction Type Across Cells", + x_labels = sj_labels, + fill_map = sj_fill_map, + y_label = "Junctions, %", + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20" + ) + + # Unique-junction RTS plot: only meaningful in reads mode. + # In isoforms mode with FL weighting, grouping by junction position and summing FL + # gives the same total as "All Junctions", making this figure redundant. + if (mode != "isoforms") { + # Build a robust unique junction label if possible + if (!("junctionLabel" %in% colnames(junc_rt))) { + if (all(c("chrom", "strand", "genomic_start_coord", "genomic_end_coord") %in% colnames(junc_rt))) { + junc_rt$junctionLabel <- with(junc_rt, paste(chrom, strand, genomic_start_coord, genomic_end_coord, sep = "_")) + } else if (all(c("chrom", "strand", "genomic_start", "genomic_end") %in% colnames(junc_rt))) { + junc_rt$junctionLabel <- with(junc_rt, paste(chrom, strand, genomic_start, genomic_end, sep = "_")) + } else if ("junction_id" %in% colnames(junc_rt)) { + junc_rt$junctionLabel <- junc_rt$junction_id + } else { + # Fallback to row index within CB as unique proxy + junc_rt$junctionLabel <- paste0("jl_", seq_len(nrow(junc_rt))) + } + } + + # Per-cell percentages for UNIQUE junctions (deduplicate by genomic coordinates per cell & SJ type). + # "Unique" means: count each distinct junction site once per cell, regardless of FL count. + # The FL-weighted explode already gave us per-cell rows, so junctionLabel dedup collapses duplicates. + uniq_junc_by_cell <- junc_rt %>% + group_by(CB, SJ_type, junctionLabel) %>% + summarise(rts_any = any(RTS_bool, na.rm = TRUE), .groups = "drop") %>% + group_by(CB, SJ_type) %>% + summarise(total = dplyr::n(), rts = sum(rts_any), .groups = "drop") %>% + mutate(perc = ifelse(total > 0, 100 * rts / total, NA_real_)) %>% + tidyr::complete(CB, SJ_type = sj_levels) + + df_long_uniq <- data.frame( + Variable = factor(uniq_junc_by_cell$SJ_type, levels = sj_levels), + Value = uniq_junc_by_cell$perc + ) + + gg_rts_unique_by_sjtype <<- build_violin_plot( + df_long = df_long_uniq, + title = "RT-switching Unique Junctions by Splice Junction Type Across Cells", + x_labels = sj_labels, + fill_map = sj_fill_map, + y_label = "Junctions, %", + legend = FALSE, + ylim = c(0, 100), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20" + ) + } + } else { + message("RTS_junction column not found in Junctions. Skipping RT-switching by SJ type plots.") + } + + # Create grouped violins for % reads with all canonical junctions by structural category (HTML) + if (!exists("gg_allcanon_by_category")) { + # Build cls2: FL-exploded in isoforms mode so sum(count) = transcript counts + status_map <- function(x) { + xch <- tolower(as.character(x)) + ifelse(xch %in% c("true", "canonical", "yes"), "True", + ifelse(xch %in% c("false", "non_canonical", "no"), "False", + ifelse(is.logical(x) && x, "True", + ifelse(is.logical(x) && !x, "False", NA_character_) + ) + ) + ) + } + if (mode == "isoforms" && "FL" %in% colnames(Classification_file) && + "CB" %in% colnames(Classification_file)) { + cls2 <- Classification_file %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate( + CB = strsplit(as.character(CB), ","), + FL = strsplit(as.character(FL), ",") + ) %>% + tidyr::unnest(c(CB, FL)) %>% + mutate( + CB = trimws(CB), + count = suppressWarnings(as.numeric(trimws(FL))) + ) %>% + mutate(count = ifelse(is.na(count) | !is.finite(count), 1, count)) %>% + dplyr::filter(!is.na(CB) & CB != "") + } else { + cls2 <- Classification_file %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate(count = 1) + } + cls2 <- cls2 %>% mutate(allcanon_status = status_map(all_canonical)) + + cat_key_map <- structural_category_map + all_cats <- structural_category_levels + bin_pretty_map <- c(FSM = "FSM", ISM = "ISM", NIC = "NIC", NNC = "NNC", + Genic = "Genic Genomic", Antisense = "Antisense", + Fusion = "Fusion", Intergenic = "Intergenic", + Genic_intron = "Genic Intron") + pretty_levels <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", + "Antisense", "Fusion", "Intergenic", "Genic Intron") + + df_allcanon <- cls2 %>% + mutate(cat_key = unname(cat_key_map[structural_category])) %>% + filter(!is.na(cat_key), !is.na(allcanon_status)) %>% + group_by(CB, cat_key, allcanon_status) %>% + summarise(n = sum(count, na.rm = TRUE), .groups = "drop") %>% + group_by(CB, cat_key) %>% + mutate(perc = 100 * n / sum(n)) %>% + ungroup() %>% + tidyr::complete(CB, cat_key = all_cats, allcanon_status = c("True", "False"), fill = list(n = 0, perc = 0)) + + if (nrow(df_allcanon) > 0) { + df_allcanon$allcanon_status <- factor(df_allcanon$allcanon_status, levels = c("True", "False")) + cols_tf <- c("True" = "#6baed6", "False" = "#ffc125") + gg_allcanon_by_category <<- build_grouped_violin_plot( + df = df_allcanon %>% transmute(bin = unname(bin_pretty_map[as.character(cat_key)]), group = as.character(allcanon_status), value = perc), + bin_levels = pretty_levels, + group_levels = c("True", "False"), + title = paste(entity_label_plural, "with All Canonical Junctions Distribution by Structural Category Across Cells"), + fill_map = cols_tf, + legend_labels = c("True" = "True", "False" = "False"), + y_label = paste(entity_label_plural, ", %", sep = ""), + ylim = c(0, 100), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.08, + x_tickangle = 45, + violin_width = 0.45, + dodge_width = 0.8, + violangap = 0.05, + violingroupgap = 0.15, + legend_title = "all_canonical" + ) + } + } + + # Coding / Non-Coding / NMD Plots + if (exists("SQANTI_cell_summary")) { + # Check if Coding columns exist + # Coding: ends with "_coding_prop" but NOT "_non_coding_prop" + all_coding_like <- grep("_coding_prop$", colnames(SQANTI_cell_summary), value = TRUE) + non_coding_cols <- grep("_non_coding_prop$", colnames(SQANTI_cell_summary), value = TRUE) + coding_cols <- setdiff(all_coding_like, non_coding_cols) + + # NMD + if ("NMD_prop_in_cell" %in% colnames(SQANTI_cell_summary)) { + nmd_cat_cols <- grep(".*_NMD_prop$", colnames(SQANTI_cell_summary), value = TRUE) + if (length(nmd_cat_cols) > 0) { + df_nmd <- SQANTI_cell_summary %>% + select(CB, all_of(nmd_cat_cols)) %>% + pivot_longer(cols = all_of(nmd_cat_cols), names_to = "Variable", values_to = "Value") %>% + mutate( + Variable = gsub("_NMD_prop$", "", Variable) + ) + + # Helper to map tag to pretty label + tag_to_pretty <- function(tag) { + case_map <- c( + "FSM" = "FSM", "ISM" = "ISM", "NIC" = "NIC", "NNC" = "NNC", + "genic" = "Genic Genomic", "antisense" = "Antisense", "fusion" = "Fusion", + "intergenic" = "Intergenic", "genic_intron" = "Genic Intron" + ) + if (tag %in% names(case_map)) { + return(case_map[[tag]]) + } + return(tag) + } + + df_nmd$PrettyVar <- sapply(df_nmd$Variable, tag_to_pretty) + + # Filter to all known categories (including non-canonical) + all_nmd_vars <- c("FSM", "ISM", "NIC", "NNC", "genic", "antisense", "fusion", "intergenic", "genic_intron") + df_nmd <- df_nmd %>% filter(Variable %in% all_nmd_vars) + + # Factor levels + nmd_levels <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") + df_nmd$PrettyVar <- factor(df_nmd$PrettyVar, levels = nmd_levels) + + # Use grey color for all NMD plots + nmd_fill_map <- setNames(rep("#969696", length(nmd_levels)), nmd_levels) + + gg_nmd_by_category <<- build_violin_plot( + df_long = data.frame(Variable = df_nmd$PrettyVar, Value = df_nmd$Value), + title = paste("Predicted NMD", entity_label_plural, "Distribution by Structural Category Across Cells"), + x_labels = levels(df_nmd$PrettyVar), + fill_map = nmd_fill_map, + y_label = paste(entity_label_plural, ", %", sep = ""), + legend = FALSE, + override_outline_vars = character(0), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + } + } + } + + ### Generate PDF report ### + ########################### + + if (generate_pdf) { + pdf(file.path(paste0(report_output, ".pdf")), paper = "a4r", width = 14, height = 11, useDingbats = FALSE) + # Cover page + grid.newpage() + title_text <- if (mode == "isoforms") "SQANTI-single cell\nisoforms report" else "SQANTI-single cell\nreads report" + cover <- textGrob(title_text, + gp = gpar(fontface = "italic", fontsize = 40, col = "orangered") + ) + grid.draw(cover) + # Overview tables + s <- textGrob("Overview", gp = gpar(fontface = "italic", fontsize = 30), vjust = 0) + grid.arrange(s) + + # Calculate bulk-level stats + if (mode == "isoforms") { + total_reads_count <- sum(Classification_file$count, na.rm = TRUE) + unique_isoforms <- nrow(Classification_file) + } else { + total_reads_count <- nrow(Classification_file) + } + unique_genes <- length(unique(Classification_file$associated_gene)) + if (mode == "isoforms") { + unique_junctions <- 0 + } else { + unique_junctions <- length(unique(Classification_file$jxn_string)) + } + + annotated_genes <- length(unique(Classification_file$associated_gene[!grepl("^novel", Classification_file$associated_gene)])) + novel_genes <- length(unique(Classification_file$associated_gene[grepl("^novel", Classification_file$associated_gene)])) + gene_class_table <- data.frame( + Category = c("Annotated Genes", "Novel Genes"), + `Genes, count` = c(annotated_genes, novel_genes), + `Genes, percent` = c( + round(100 * annotated_genes / unique_genes, 1), + round(100 * novel_genes / unique_genes, 1) + ), + check.names = FALSE + ) + + # Read Classification table (counts per structural category) + read_cat_levels <- c( + "full-splice_match", "incomplete-splice_match", "novel_in_catalog", "novel_not_in_catalog", + "genic", "antisense", "fusion", "intergenic", "genic_intron" + ) + read_cat_names <- c( + "FSM", "ISM", "NIC", "NNC", + "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" + ) + + if (mode == "isoforms") { + read_class_table <- aggregate(Classification_file$count, by = list(Category = factor(Classification_file$structural_category, levels = read_cat_levels)), FUN = sum, na.rm = TRUE) + colnames(read_class_table) <- c("Category", "Transcripts, count") + # Ensure all levels are present + read_class_temp <- data.frame(Category = read_cat_levels) + read_class_table <- merge(read_class_temp, read_class_table, by = "Category", all.x = TRUE) + read_class_table <- read_class_table[match(read_cat_levels, read_class_table$Category), ] + read_class_table[is.na(read_class_table)] <- 0 + } else { + read_class_table <- as.data.frame(table(factor(Classification_file$structural_category, levels = read_cat_levels))) + colnames(read_class_table) <- c("Category", "Transcripts, count") + } + read_class_table$Category <- read_cat_names + read_class_table[["Transcripts, percent"]] <- round(100 * read_class_table[["Transcripts, count"]] / sum(read_class_table[["Transcripts, count"]]), 1) + + # Splice Junction Classification table + Junctions$junction_type <- paste(Junctions$junction_category, Junctions$canonical, sep = "_") + + sj_types <- c("known_canonical", "known_non_canonical", "novel_canonical", "novel_non_canonical") + if (mode == "isoforms") { + sj_counts <- sapply(sj_types, function(type) sum(Junctions$count[Junctions$junction_type == type], na.rm = TRUE)) + } else { + sj_counts <- sapply(sj_types, function(type) sum(Junctions$junction_type == type, na.rm = TRUE)) + } + + # Handle case where there are no junctions + total_junctions <- sum(sj_counts, na.rm = TRUE) + sj_perc <- if (total_junctions > 0) { + round(100 * sj_counts / total_junctions, 2) + } else { + rep(0, length(sj_counts)) + } + + SJ_class_table <- data.frame( + Category = c("Known canonical", "Known Non-canonical", "Novel canonical", "Novel Non-canonical"), + `SJs, count` = sj_counts, + `SJs, percent` = sj_perc, + check.names = FALSE + ) + rownames(SJ_class_table) <- NULL + + big_table_theme <- ttheme_default( + core = list(fg_params = list(cex = 1.1)), + colhead = list(fg_params = list(cex = 1.1, fontface = "bold")) + ) + + title_genes <- textGrob("Gene Classification", gp = gpar(fontface = "italic", fontsize = 20), vjust = -3) + title_reads <- textGrob(paste(entity_label, "Classification"), gp = gpar(fontface = "italic", fontsize = 20), vjust = -7.7) + title_sj <- textGrob("Splice Junction Classification", gp = gpar(fontface = "italic", fontsize = 20), vjust = -4.3) + + table_genes <- tableGrob(gene_class_table, rows = NULL, theme = big_table_theme) + table_reads <- tableGrob(read_class_table, rows = NULL, theme = big_table_theme) + table_sj <- tableGrob(SJ_class_table, rows = NULL, theme = big_table_theme) + + if (mode == "isoforms") { + unique_counts_text <- sprintf( + "Number of %s: %s\nUnique Isoforms: %s\nUnique Genes: %s", + entity_label_plural, + format(total_reads_count, big.mark = ","), + format(unique_isoforms, big.mark = ","), + format(unique_genes, big.mark = ",") + ) + } else if (unique_junctions > 0) { + unique_counts_text <- sprintf( + "Number of %s: %s\nUnique Genes: %s\nUnique Junction Chains: %s", + entity_label_plural, + format(total_reads_count, big.mark = ","), + format(unique_genes, big.mark = ","), + format(unique_junctions, big.mark = ",") + ) + } else { + unique_counts_text <- sprintf( + "Number of %s: %s\nUnique Genes: %s", + entity_label_plural, + format(total_reads_count, big.mark = ","), + format(unique_genes, big.mark = ",") + ) + } + unique_counts_grob <- textGrob( + unique_counts_text, + gp = gpar(fontface = "italic", fontsize = 28), vjust = 0, hjust = 0.5 + ) + + # Create gTree objects to overlay titles and tables + gt_genes <- gTree(children = gList(table_genes, title_genes)) + gt_reads <- gTree(children = gList(table_reads, title_reads)) + gt_sj <- gTree(children = gList(table_sj, title_sj)) + + # Arrange left column: Gene Classification + Splice Junction Classification + left_col <- arrangeGrob( + gt_genes, + gt_sj, + ncol = 1, + heights = c(0.2, 0.4) + ) + + # Arrange right column: Read Classification + right_col <- arrangeGrob( + gt_reads, + ncol = 1 + ) + + # Final page layout + grid.arrange( + unique_counts_grob, + arrangeGrob(left_col, right_col, ncol = 2, widths = c(1.3, 1.3)), + nrow = 2, + heights = c(0.8, 1) + ) + + # Single cell tables + s <- textGrob("Cell summary", gp = gpar(fontface = "italic", fontsize = 30), vjust = 0) + grid.arrange(s) + + # Number of cells + num_cells <- nrow(SQANTI_cell_summary) + num_cells_grob <- textGrob( + sprintf("Unique Cell Barcodes: %d", num_cells), + gp = gpar(fontface = "italic", fontsize = 28), vjust = 0.5, hjust = 0.5 + ) + + # 1. Unique Genes and Unique Junction Chains summary table + unique_genes_stats <- c( + Mean = mean(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + Median = median(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + Min = min(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + Max = max(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + SD = sd(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE) + ) + unique_junctions_stats <- c( + Mean = mean(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + Median = median(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + Min = min(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + Max = max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + SD = sd(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) + ) + reads_stats <- c( + Mean = mean(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + Median = median(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + Min = min(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + Max = max(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + SD = sd(SQANTI_cell_summary[[count_col]], na.rm = TRUE) + ) + umis_stats <- c( + Mean = mean(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + Median = median(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + Min = min(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + Max = max(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + SD = sd(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE) + ) + summary_table1 <- data.frame( + Feature = c(paste(entity_label_plural, "in cell"), "UMIs in cell", "Unique Genes", "Unique Junction Chains"), + Mean = c(reads_stats["Mean"], umis_stats["Mean"], unique_genes_stats["Mean"], unique_junctions_stats["Mean"]), + Median = c(reads_stats["Median"], umis_stats["Median"], unique_genes_stats["Median"], unique_junctions_stats["Median"]), + Min = c(reads_stats["Min"], umis_stats["Min"], unique_genes_stats["Min"], unique_junctions_stats["Min"]), + Max = c(reads_stats["Max"], umis_stats["Max"], unique_genes_stats["Max"], unique_junctions_stats["Max"]), + IQR = c(reads_stats["IQR"], umis_stats["IQR"], unique_genes_stats["IQR"], unique_junctions_stats["IQR"]), + SD = c(reads_stats["SD"], umis_stats["SD"], unique_genes_stats["SD"], unique_junctions_stats["SD"]) + ) + # If isoforms mode, drop Unique Junction Chains from summary table + if (mode == "isoforms" || !("UJCs_in_cell" %in% names(SQANTI_cell_summary)) || all(is.na(SQANTI_cell_summary$UJCs_in_cell)) || max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) == 0) { + summary_table1 <- summary_table1[!(summary_table1$Feature %in% c("Unique Junction Chains", "UMIs in cell")), , drop = FALSE] + } + summary_table1[, 2:7] <- round(summary_table1[, 2:7], 3) + table_summary1 <- tableGrob(summary_table1, rows = NULL, theme = big_table_theme) + gt_summary1 <- gTree(children = gList(table_summary1)) + + # 2. Gene Classification summary table (across all cells) + gene_class_stats <- data.frame( + Category = c("Annotated Genes", "Novel Genes"), + Mean = c(mean(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), mean(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), + Median = c(median(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), median(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), + Min = c(min(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), min(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), + Max = c(max(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), max(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)), + SD = c(sd(SQANTI_cell_summary$Annotated_genes, na.rm = TRUE), sd(SQANTI_cell_summary$Novel_genes, na.rm = TRUE)) + ) + gene_class_stats[, 2:6] <- round(gene_class_stats[, 2:6], 3) + table_gene_class_stats <- tableGrob(gene_class_stats, rows = NULL, theme = big_table_theme) + title_gene_class_stats <- textGrob("Gene Classification (per cell)", gp = gpar(fontface = "italic", fontsize = 22), vjust = -2.9) + gt_gene_class_stats <- gTree(children = gList(table_gene_class_stats, title_gene_class_stats)) + + # 3. Splice Junction Classification summary table (across all cells) + + # Create a junction type column for easier summarization + Junctions$junction_type <- paste(Junctions$junction_category, Junctions$canonical, sep = "_") + + # Calculate proportions of each junction type per cell + junction_proportions_per_cell <- Junctions %>% + filter(CB != "unassigned") %>% + group_by(CB) %>% + summarise( + Known_canonical = sum(junction_type == "known_canonical", na.rm = TRUE) / n() * 100, + Known_Non_canonical = sum(junction_type == "known_non_canonical", na.rm = TRUE) / n() * 100, + Novel_canonical = sum(junction_type == "novel_canonical", na.rm = TRUE) / n() * 100, + Novel_Non_canonical = sum(junction_type == "novel_non_canonical", na.rm = TRUE) / n() * 100, + .groups = "drop" + ) + + # Calculate summary statistics across all cells + sj_stats <- junction_proportions_per_cell %>% + select(-CB) %>% + summarise( + across( + everything(), + list( + Mean = ~ mean(.x, na.rm = TRUE), + Median = ~ median(.x, na.rm = TRUE), + Min = ~ min(.x, na.rm = TRUE), + Max = ~ max(.x, na.rm = TRUE), + SD = ~ sd(.x, na.rm = TRUE) + ) + ) + ) + + # Reshape the data for display + sj_stats_df <- sj_stats %>% + pivot_longer( + cols = everything(), + names_to = c("Category", ".value"), + names_pattern = "(.+)_(Mean|Median|Min|Max|SD)$" + ) %>% + mutate(Category = gsub("_", " ", Category)) + + # Ensure we have the expected number of columns before subsetting + if (ncol(sj_stats_df) >= 6) { + sj_stats_df[, 2:6] <- round(sj_stats_df[, 2:6], 3) + } else { + # If we have fewer columns, round all numeric columns except the first (Category) + numeric_cols <- sapply(sj_stats_df[, -1], is.numeric) + sj_stats_df[, -1][numeric_cols] <- round(sj_stats_df[, -1][numeric_cols], 3) + } + table_sj_stats <- tableGrob(sj_stats_df, rows = NULL, theme = big_table_theme) + title_sj_stats <- textGrob("Splice Junction Classification (per cell, %)", gp = gpar(fontface = "italic", fontsize = 22), vjust = -4.4) + gt_sj_stats <- gTree(children = gList(table_sj_stats, title_sj_stats)) + + grid.arrange( + num_cells_grob, + gt_summary1, + gt_gene_class_stats, + gt_sj_stats, + ncol = 1, + heights = c(0.3, 1, 0.7, 0.9) + ) + + # Cell Summary Statistics Page 2: Read Classification + title_read_class <- textGrob(paste(entity_label, "Classification"), gp = gpar(fontface = "italic", fontsize = 28), vjust = 0, hjust = 0.5) + desc_counts <- textGrob(paste("Summary of per cell", entity_label_lower, "counts by structural category"), gp = gpar(fontface = "italic", fontsize = 18), vjust = 0.5) + desc_props <- textGrob(paste("Summary of per cell", entity_label_lower, "percentages by structural category"), gp = gpar(fontface = "italic", fontsize = 18), vjust = 0.5) + struct_cat_cols <- c( + "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" + ) + struct_cat_names <- c( + "FSM", "ISM", "NIC", "NNC", + "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" + ) + + # Smaller table theme for these two tables + small_table_theme <- ttheme_default( + core = list(fg_params = list(cex = 1.2)), + colhead = list(fg_params = list(cex = 1.2, fontface = "bold")) + ) + + # 1. Counts summary table + count_stats <- sapply(struct_cat_cols, function(col) { + vals <- SQANTI_cell_summary[[col]] + c( + Mean = mean(vals, na.rm = TRUE), + Median = median(vals, na.rm = TRUE), + Min = min(vals, na.rm = TRUE), + Max = max(vals, na.rm = TRUE), + IQR = IQR(vals, na.rm = TRUE), + SD = sd(vals, na.rm = TRUE) + ) + }) + count_stats_df <- data.frame( + Category = struct_cat_names, + t(count_stats) + ) + colnames(count_stats_df)[2:7] <- c("Mean", "Median", "Min", "Max", "IQR", "SD") + count_stats_df[, 2:7] <- round(count_stats_df[, 2:7], 3) + table_count_stats <- tableGrob(count_stats_df, rows = NULL, theme = small_table_theme) + + # 2. Proportions summary table + prop_cat_cols <- paste0(struct_cat_cols, "_prop") + prop_stats <- sapply(prop_cat_cols, function(col) { + vals <- SQANTI_cell_summary[[col]] + c( + Mean = mean(vals, na.rm = TRUE), + Median = median(vals, na.rm = TRUE), + Min = min(vals, na.rm = TRUE), + Max = max(vals, na.rm = TRUE), + IQR = IQR(vals, na.rm = TRUE), + SD = sd(vals, na.rm = TRUE) + ) + }) + prop_stats_df <- data.frame( + Category = struct_cat_names, + t(prop_stats) + ) + colnames(prop_stats_df)[2:7] <- c("Mean", "Median", "Min", "Max", "IQR", "SD") + prop_stats_df[, 2:7] <- round(prop_stats_df[, 2:7], 3) + table_prop_stats <- tableGrob(prop_stats_df, rows = NULL, theme = small_table_theme) + + grid.arrange( + title_read_class, + desc_counts, + table_count_stats, + desc_props, + table_prop_stats, + ncol = 1, + heights = c(0.3, 0.12, 1, 0.12, 1) + ) + + # Helper for section title pages + section_page <- function(title) { + grid.newpage() + grid.draw(textGrob(title, gp = gpar(fontface = "italic", fontsize = 30, col = "black"))) + } + + # Per-cell Library Size section + section_page("Per-cell Library Size") + render_pdf_plot_centered("gg_reads_in_cells", width_frac = 0.5) + render_pdf_plot_centered("gg_umis_in_cells", width_frac = 0.5) + if (exists("gg_JCs_in_cell")) render_pdf_plot_centered("gg_JCs_in_cell", width_frac = 0.5) + + # Gene Characterization section + section_page("Gene Characterization") + # Genes Across Cells + render_pdf_plot_centered("gg_genes_in_cells", width_frac = 0.5) + render_pdf_plot("gg_annotation_of_genes_in_cell") + render_pdf_plot("gg_annotation_of_genes_percent_in_cell") + # Reads per Gene + render_pdf_plot("gg_annotation_of_reads_in_cell") + render_pdf_plot("gg_read_bins_all") + render_pdf_plot("gg_read_bins") + if (mode == "isoforms" && exists("gg_isoform_bins")) { + render_pdf_plot("gg_isoform_bins") + } + # UJCs per Gene + if (mode != "isoforms" && exists("gg_ujc_bins_all")) { + render_pdf_plot("gg_ujc_bins_all") + render_pdf_plot("gg_ujc_bins") + } + # Mitochondrial genes + render_pdf_plot("gg_MT_perc") + + # Read Length Characterization section + section_page(paste(entity_label, "Length Characterization")) + # Bulk Length Distribution + render_pdf_plot("gg_bulk_all_reads") + render_pdf_plot("gg_bulk_length_by_category") + render_pdf_plot("gg_bulk_length_by_exon_type") + # Overall cell-level distributions: All then Mono on next page + render_pdf_plot("gg_read_distr") + render_pdf_plot("gg_read_distr_mono") + # Category-specific: print All then Mono right after + for (tag in c("FSM", "ISM", "NIC", "NNC", "genic", "antisense", "fusion", "intergenic", "genic_intron")) { + all_nm <- paste0("gg_", tag, "_read_distr") + mono_nm <- paste0("gg_", tag, "_mono_read_distr") + if (exists(all_nm)) render_pdf_plot(all_nm) + if (exists(mono_nm)) render_pdf_plot(mono_nm) + } + # Reference Transcript Coverage + render_pdf_plot("gg_ref_coverage_across_category") + render_pdf_plot("gg_meta_transcript_coverage") + if (exists("gg_isoforms_ref_vs_sample_lengths") && !is.null(gg_isoforms_ref_vs_sample_lengths)) { + render_pdf_plot("gg_isoforms_ref_vs_sample_lengths") + } + + # Structural Read Characterization section + section_page(paste("Structural", entity_label, "Characterization")) + # Distribution by Structural Categories + render_pdf_plot("gg_SQANTI_across_category") + render_pdf_plot("gg_exon_mono_by_category") + for (nm in c( + "gg_SQANTI_across_FSM", "gg_SQANTI_across_ISM", "gg_SQANTI_across_NIC", "gg_SQANTI_across_NNC", + "gg_SQANTI_across_Genic", "gg_SQANTI_across_Antisense", "gg_SQANTI_across_Fusion", "gg_SQANTI_across_Intergenic", + "gg_SQANTI_across_Genic_Intron" + )) { + render_pdf_plot(nm) + } + # Exon Counts + render_pdf_plot("gg_exon_mean_by_category") + if (exists("gg_exon_profile_by_category")) { + prof_order <- c("FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron") + for (nm in prof_order) { + if (!is.null(gg_exon_profile_by_category[[nm]])) { + print(gg_exon_profile_by_category[[nm]]) + } + } + } + + # Coding and Non-Coding Distributions + if (!skipORF) { + if (exists("gg_coding_across_category")) render_pdf_plot("gg_coding_across_category") + if (exists("gg_non_coding_across_category")) render_pdf_plot("gg_non_coding_across_category") + } + + # Splice Junction Characterization section + # Compute per-structural-category SJ distributions for PDF pages + + # Find the common isoform ID column between Junctions and Classification_file + junc_iso_key <- NULL + for (k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { + if (k %in% colnames(Junctions) && k %in% colnames(Classification_file)) { + junc_iso_key <- k + break + } + } + + if (mode == "isoforms" && !is.null(junc_iso_key) && + "FL" %in% colnames(Classification_file) && "CB" %in% colnames(Classification_file)) { + # Explode Classification_file CB/FL to get one row per (isoform, cell, FL_count) + cls_exploded <- Classification_file %>% + filter(!is.na(CB), !is.na(structural_category)) %>% + select(all_of(c(junc_iso_key, "CB", "FL", "structural_category"))) %>% + mutate(CB_raw = as.character(CB), FL_raw = as.character(FL)) %>% + tidyr::separate_rows(CB_raw, FL_raw, sep = ",") %>% + mutate( + CB_clean = trimws(CB_raw), + FL_num = suppressWarnings(as.numeric(trimws(FL_raw))) + ) %>% + filter(CB_clean != "", CB_clean != "unassigned", !is.na(FL_num), FL_num > 0) %>% + select(all_of(c(junc_iso_key, "CB_clean", "FL_num", "structural_category"))) + + # Join junctions to exploded classification by isoform key only. + # Select only junction-type columns from Junctions first to avoid + # column name collisions with the CB/count columns in cls_exploded. + junc_aug <- Junctions %>% + select(all_of(c(junc_iso_key, "junction_category", "canonical"))) %>% + mutate(junction_type = paste(junction_category, canonical, sep = "_")) %>% + inner_join(cls_exploded, by = junc_iso_key) %>% + rename(CB = CB_clean, count = FL_num) + + } else { + # Reads mode: each junction row already has a single CB; count = 1 + junc_aug <- Junctions %>% + dplyr::filter(!is.na(CB) & CB != "" & CB != "unassigned") %>% + mutate(junction_type = paste(junction_category, canonical, sep = "_"), + count = 1) + + # Bring in structural_category if not already present + if (!("structural_category" %in% colnames(junc_aug)) && !is.null(junc_iso_key)) { + junc_aug <- junc_aug %>% + left_join( + Classification_file %>% select(all_of(c(junc_iso_key, "structural_category"))), + by = junc_iso_key + ) + } + } + + cat_key_map <- structural_category_map + all_cats <- structural_category_levels + + junc_summ <- junc_aug %>% + filter(!is.na(structural_category)) %>% + mutate(cat_key = unname(cat_key_map[structural_category])) %>% + filter(!is.na(cat_key)) %>% + group_by(CB, cat_key) %>% + summarise( + total = sum(count, na.rm = TRUE), + known_canonical = sum(count[junction_type == "known_canonical"], na.rm = TRUE), + known_non_canonical = sum(count[junction_type == "known_non_canonical"], na.rm = TRUE), + novel_canonical = sum(count[junction_type == "novel_canonical"], na.rm = TRUE), + novel_non_canonical = sum(count[junction_type == "novel_non_canonical"], na.rm = TRUE), + .groups = "drop" + ) %>% + # Do NOT fill absent (CB, cat_key) pairs with 0 — cells with no transcripts + # of a given category should be NA (excluded from violin), not 0%. + mutate( + KnownCanonicalPerc = ifelse(total > 0, 100 * known_canonical / total, NA_real_), + KnownNonCanonicalPerc = ifelse(total > 0, 100 * known_non_canonical / total, NA_real_), + NovelCanonicalPerc = ifelse(total > 0, 100 * novel_canonical / total, NA_real_), + NovelNonCanonicalPerc = ifelse(total > 0, 100 * novel_non_canonical / total, NA_real_) + ) %>% + # Expand to all categories so every category column exists for plotting, + # but keep NA for cells absent from that category + tidyr::complete(CB, cat_key = all_cats) %>% + ungroup() + + # Prepare plotting helpers + fill_map_cat <- cat_fill_map + x_labels_full <- cat_labels_pretty + make_df_long <- function(col_name) { + data.frame(Variable = factor(junc_summ$cat_key, levels = all_cats), Value = junc_summ[[col_name]]) + } + + p_known_can <- build_violin_plot( + df_long = make_df_long("KnownCanonicalPerc"), + title = "Known Canonical Splice Junctions Distribution by Structural Category Across Cells", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + p_known_noncan <- build_violin_plot( + df_long = make_df_long("KnownNonCanonicalPerc"), + title = "Known Non-canonical Splice Junctions Distribution by Structural Category Across Cells", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + p_novel_can <- build_violin_plot( + df_long = make_df_long("NovelCanonicalPerc"), + title = "Novel Canonical Splice Junctions Distribution by Structural Category Across Cells", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + p_novel_noncan <- build_violin_plot( + df_long = make_df_long("NovelNonCanonicalPerc"), + title = "Novel Non-canonical Splice Junctions Distribution by Structural Category Across Cells", + x_labels = x_labels_full, + fill_map = fill_map_cat, + y_label = "Junctions, %", + legend = FALSE, + override_outline_vars = c("Genic"), + violin_alpha = 0.7, + box_alpha = 0.3, + box_width = 0.05, + x_tickangle = 45, + violin_outline_fill = TRUE, + box_outline_default = "grey20", + ylim = c(0, 100) + ) + + section_page("Splice Junction Characterization") + render_pdf_plot("gg_known_novel_canon") + print(p_known_can) + print(p_known_noncan) + print(p_novel_can) + print(p_novel_noncan) + render_pdf_plot("gg_allcanon_by_category") + if (exists("gg_rts_all_by_sjtype")) render_pdf_plot("gg_rts_all_by_sjtype") + if (exists("gg_rts_unique_by_sjtype")) render_pdf_plot("gg_rts_unique_by_sjtype") + + # Features of Bad Quality section + section_page("Features of Bad Quality") + render_pdf_plot("gg_bad_feature") + render_pdf_plot("gg_intrapriming_by_category") + render_pdf_plot("gg_RTS_by_category") + + render_pdf_plot("gg_noncanon_by_category") + if (exists("gg_nmd_by_category")) render_pdf_plot("gg_nmd_by_category") + + # Features of Good Quality section + section_page("Features of Good Quality") + render_pdf_plot("gg_good_feature") + render_pdf_plot("gg_tss_annotation_support") + if (CAGE_peak) render_pdf_plot("gg_cage_peak_support") + if (polyA_motif_list) render_pdf_plot("gg_polyA_motif_support") + render_pdf_plot("gg_canon_by_category") + if (exists("gg_sr_support_by_category")) render_pdf_plot("gg_sr_support_by_category") + if (exists("gg_tss_validation_by_category")) render_pdf_plot("gg_tss_validation_by_category") + + # Clustering Analysis + if (exists("gg_umap") && !is.null(gg_umap)) { + section_page("Clustering analysis") + print(gg_umap) + + # Print UMAP by structural category if available (one per page) + if (exists("gg_umap_by_category") && !is.null(gg_umap_by_category)) { + for (cat_label in names(gg_umap_by_category)) { + # Print the UMAP + print(gg_umap_by_category[[cat_label]]) + } + + # Then print out ALL the corresponding Structural Categories Violin Plots + if (exists("gg_cat_cluster_plots") && !is.null(gg_cat_cluster_plots)) { + for (cat_label in names(gg_cat_cluster_plots)) { + print(gg_cat_cluster_plots[[cat_label]]) + } + } + + # Length Distribution by Cluster (PDF) + if (exists("gg_len_cluster_plots") && !is.null(gg_len_cluster_plots)) { + for (len_lbl in names(gg_len_cluster_plots)) { + if (!is.null(gg_len_cluster_plots[[len_lbl]])) { + print(gg_len_cluster_plots[[len_lbl]]) + } + } + } + } + + # Print Short Read Support by Cluster (Violin + UMAPs) + if (exists("gg_sr_cluster_plots") && !is.null(gg_sr_cluster_plots)) { + # UMAPs first (Global then Category-specific if available) + if (exists("gg_sr_umap_plots") && !is.null(gg_sr_umap_plots)) { + if (!is.null(gg_sr_umap_plots[["All Transcripts"]])) print(gg_sr_umap_plots[["All Transcripts"]]) + for (label in setdiff(names(gg_sr_umap_plots), "All Transcripts")) { + print(gg_sr_umap_plots[[label]]) + } + } + + # Violin plots + for (label in names(gg_sr_cluster_plots)) { + # Print ggplot for PDF + p_ggplot <- gg_sr_cluster_plots[[label]] + print(p_ggplot) + } + } + + # Print TSS Validation Support by Cluster (Violin + UMAPs) + if (exists("gg_tss_cluster_plots") && !is.null(gg_tss_cluster_plots)) { + # UMAPs first + if (exists("gg_tss_umap_plots") && !is.null(gg_tss_umap_plots)) { + if (!is.null(gg_tss_umap_plots[["All Transcripts"]])) print(gg_tss_umap_plots[["All Transcripts"]]) + for (label in setdiff(names(gg_tss_umap_plots), "All Transcripts")) { + print(gg_tss_umap_plots[[label]]) + } + } + + # Violin plots + for (label in names(gg_tss_cluster_plots)) { + # Print ggplot for PDF + p_ggplot <- gg_tss_cluster_plots[[label]] + print(p_ggplot) + } + } + } + + dev.off() + } +} + +Classification <- data.table::fread(class.file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) +if (mode == "isoforms" && "FL" %in% colnames(Classification)) { + Classification$count <- sapply(strsplit(as.character(Classification$FL), ","), function(x) sum(as.numeric(x), na.rm = TRUE)) + Classification$count[is.na(Classification$count) | Classification$count == 0] <- 1 +} else { + Classification$count <- 1 +} +Junctions <- data.table::fread(junc.file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) + +# Add count column to Junctions for weighted quantification +if (mode == "isoforms") { + # Try to join by isoform ID + join_key <- NULL + for (k in c("isoform", "readID", "read_id", "ID", "read_name", "read")) { + if (k %in% colnames(Junctions) && k %in% colnames(Classification)) { + join_key <- k + break + } + } + + if (!is.null(join_key)) { + # Assign isoform count to each junction row + + # Use match to be faster than merge/join for simple lookup + Junctions$count <- Classification$count[match(Junctions[[join_key]], Classification[[join_key]])] + # Handle NAs (shouldn't happen if consistent) + Junctions$count[is.na(Junctions$count)] <- 1 + } else { + Junctions$count <- 1 + } +} else { + Junctions$count <- 1 +} + +# Require precomputed cell summary produced by sqanti_sc.py +if (!is.null(cell_summary_path) && file.exists(cell_summary_path)) { + SQANTI_cell_summary <- data.table::fread(cell_summary_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, data.table = FALSE) +} else { + stop("A precomputed cell summary is required. Pass --cell_summary from sqanti_sc.py.") +} + +# Generate reports based on format +if (report.format == "pdf" || report.format == "both") { + generate_sqantisc_plots( + SQANTI_cell_summary, + Classification, + Junctions, + report_output + ) +} + +if (report.format == "html" || report.format == "both") { + # Generate plots first so they're available for Rmd + if (report.format == "html") { + generate_sqantisc_plots( + SQANTI_cell_summary, + Classification, + Junctions, + report_output, + generate_pdf = FALSE + ) + } + # Set up HTML report generation + # Get the directory where this R script is located (utilities folder) + cmd_args <- commandArgs(trailingOnly = FALSE) + script_arg <- cmd_args[grep("--file=", cmd_args)] + if (length(script_arg) > 0) { + script_path <- gsub("--file=", "", script_arg) + script_dir <- dirname(normalizePath(script_path)) + } else { + stop("Cannot determine script location") + } + + rmd_file <- file.path(script_dir, "SQANTI-sc_report.Rmd") + css_file <- file.path(script_dir, "style.css") + + # Check if Rmd file exists + if (!file.exists(rmd_file)) { + stop( + "R Markdown file not found: ", rmd_file, + "\nPlease ensure SQANTI-sc_report.Rmd is in the same directory as this script." + ) + } + + # Copy CSS file to output directory if it exists + if (file.exists(css_file)) { + css_output <- file.path(dirname(report_output), "style.css") + file.copy(css_file, css_output, overwrite = TRUE) + } + + # Generate HTML report + html_output_file <- paste0(report_output, ".html") + + message("Generating HTML report...") + + rmarkdown::render( + input = rmd_file, + output_file = html_output_file, + output_dir = dirname(report_output), + intermediates_dir = dirname(report_output), + quiet = FALSE, + envir = globalenv() + ) + + # Cleanup: remove the copied CSS file + if (exists("css_output") && file.exists(css_output)) { + file.remove(css_output) + } + + message("HTML report generated: ", html_output_file) +} diff --git a/src/report_assets/SQANTI-sc_report.Rmd b/src/report_assets/SQANTI-sc_report.Rmd index 24028e5..18127a1 100644 --- a/src/report_assets/SQANTI-sc_report.Rmd +++ b/src/report_assets/SQANTI-sc_report.Rmd @@ -31,57 +31,19 @@ SQANTI-single cell reads report structure: knitr::opts_chunk$set(echo = FALSE, cache = FALSE, warning = FALSE, message = FALSE, results = "asis") # Helper function to check if data exists -has_data <- function(df) { - if (!is.null(df) && length(df) > 0 && is.data.frame(df) && nrow(df) > 0) { - return(TRUE) - } else { - return(FALSE) - } -} - -# Helper: render a mixed list of plotly widgets and static images in a simple responsive grid -render_mixed_grid <- function(plots, title = NULL, ncols = 2) { - if (!is.null(title)) cat("

", title, "

\n", sep = "") - cat('
') - basis <- sprintf("%.2f%%", 100 / ncols - 2) - for (p in plots) { - cat('
', sep = "") - print(p) - cat("
") - } - cat("
\n") -} +has_data <- function(df) !is.null(df) && is.data.frame(df) && nrow(df) > 0 # Load required libraries library(ggplot2) library(dplyr) library(tidyr) library(forcats) -library(plotly) library(DT) library(knitr) library(grid) library(gridExtra) library(grDevices) -# Helper: render ggplot or plotly consistently in HTML with optional sizing -render_ggplotly <- function(p, target_width = NULL, target_height = NULL) { - # If it's already a plotly object, apply layout sizing - if (inherits(p, "plotly")) { - if (!is.null(target_width) || !is.null(target_height)) { - p <- layout(p, width = target_width, height = target_height) - } - return(p) - } - # If it's a ggplot object, convert via ggplotly - if (inherits(p, "gg") || inherits(p, "ggplot")) { - gp <- plotly::ggplotly(p, width = target_width, height = target_height) - return(gp) - } - # Fallback placeholder to avoid knitting errors - plotly::plot_ly() %>% layout(title = list(text = "Plot unavailable")) -} - # Reusable table renderer: interactive, sortable, and consistent alignment render_sqanti_table <- function(df, digits = 3, page_len = 10) { stopifnot(is.data.frame(df)) @@ -91,11 +53,12 @@ render_sqanti_table <- function(df, digits = 3, page_len = 10) { dt <- DT::datatable( df, rownames = FALSE, + width = "100%", options = list( ordering = TRUE, pageLength = page_len, paging = FALSE, - autoWidth = TRUE, + autoWidth = FALSE, lengthChange = FALSE, searching = FALSE, info = FALSE, @@ -122,49 +85,13 @@ render_sqanti_table <- function(df, digits = 3, page_len = 10) { ) dt } - -# Fix Plotly UMAP style to match PDF -fix_umap_plotly <- function(p) { - # Convert to WebGL for performance (Disabled to fix blank plots) - # p <- plotly::toWebGL(p) - p_build <- plotly_build(p) - - # Adjust all traces that have a colorbar - for (i in seq_along(p_build$x$data)) { - if (!is.null(p_build$x$data[[i]]$marker$colorbar)) { - # Use update list to ensure we don't break structure - p_build$x$data[[i]]$marker$colorbar$len <- 0.8 - p_build$x$data[[i]]$marker$colorbar$thickness <- 30 - - # Ensure title is treated correctly. Colorbar title is usually a string or a list. - # If it is a string, we might need to make it a list to set side, or use a different approach. - # However, let's try setting it safely. - if (is.list(p_build$x$data[[i]]$marker$colorbar$title)) { - p_build$x$data[[i]]$marker$colorbar$title$side <- "top" - } else if (is.character(p_build$x$data[[i]]$marker$colorbar$title)) { - # If it's a string, convert to list - orig_title <- p_build$x$data[[i]]$marker$colorbar$title - p_build$x$data[[i]]$marker$colorbar$title <- list(text = orig_title, side = "top") - } - } - } - - # Enforce axis font sizes using layout function which is safer - p_build <- layout(p_build, - xaxis = list(tickfont = list(size = 14)), - yaxis = list(tickfont = list(size = 14)), - margin = list(t = 80) - ) - - return(p_build) -} ``` -# Bulk Summary +# Overview -```{r bulk-summary-text} +```{r overview-text} if (exists("Classification") && has_data(Classification)) { - # Calculate bulk-level stats + # Calculate bulk-like stats total_reads_count <- sum(Classification$count) unique_genes <- length(unique(Classification$associated_gene)) if ("jxn_string" %in% colnames(Classification)) { @@ -174,6 +101,10 @@ if (exists("Classification") && has_data(Classification)) { } cat(paste0("**Number of ", entity_label_plural, ":** "), format(total_reads_count, big.mark = ","), "\n\n") + if (mode == "isoforms") { + unique_isoforms <- nrow(Classification) + cat("**Unique Isoforms:** ", format(unique_isoforms, big.mark = ","), "\n\n") + } cat("**Unique Genes:** ", format(unique_genes, big.mark = ","), "\n\n") if (mode != "isoforms" && exists("unique_junctions") && !is.null(unique_junctions)) { cat("**Unique Junction Chains:** ", format(unique_junctions, big.mark = ","), "\n\n") @@ -185,7 +116,7 @@ if (exists("Classification") && has_data(Classification)) { ### Gene Classification -```{r bulk-gene-classification} +```{r overview-gene-classification} if (exists("Classification") && has_data(Classification)) { # Gene Classification table unique_genes <- length(unique(Classification$associated_gene)) @@ -194,11 +125,12 @@ if (exists("Classification") && has_data(Classification)) { gene_class_table <- data.frame( Category = c("Annotated Genes", "Novel Genes"), - Count = c(annotated_genes, novel_genes), - Percentage = c( + `Genes, count` = c(annotated_genes, novel_genes), + `Genes, percent` = c( round(100 * annotated_genes / unique_genes, 1), round(100 * novel_genes / unique_genes, 1) - ) + ), + check.names = FALSE ) rownames(gene_class_table) <- NULL @@ -208,7 +140,7 @@ if (exists("Classification") && has_data(Classification)) { ### `r paste(entity_label, "Classification")` -```{r bulk-read-classification} +```{r overview-read-classification} if (exists("Classification") && has_data(Classification)) { # Read Classification table (counts per structural category) read_cat_levels <- c( @@ -228,12 +160,13 @@ if (exists("Classification") && has_data(Classification)) { # Reorder read_class_table <- read_class_table[match(read_cat_levels, read_class_table$structural_category), ] - colnames(read_class_table) <- c("Category", "Count") + colnames(read_class_table) <- c("Category", "Transcripts, count") read_class_table$Category <- read_cat_names - read_class_table$Percentage <- round(100 * read_class_table$Count / sum(read_class_table$Count), 1) + read_class_table[["Transcripts, percent"]] <- round(100 * read_class_table[["Transcripts, count"]] / sum(read_class_table[["Transcripts, count"]]), 1) # Remove categories with 0 counts - read_class_table <- read_class_table[read_class_table$Count > 0, ] + count_col_name <- "Transcripts, count" + read_class_table <- read_class_table[read_class_table[[count_col_name]] > 0, ] rownames(read_class_table) <- NULL render_sqanti_table(read_class_table, digits = 2, page_len = 10) @@ -242,7 +175,7 @@ if (exists("Classification") && has_data(Classification)) { ### Splice Junction Classification -```{r bulk-splice-junction-classification} +```{r overview-splice-junction-classification} if (exists("Junctions") && has_data(Junctions)) { # Splice Junction Classification table Junctions$junction_type <- paste(Junctions$junction_category, Junctions$canonical, sep = "_") @@ -259,8 +192,9 @@ if (exists("Junctions") && has_data(Junctions)) { SJ_class_table <- data.frame( Category = c("Known canonical", "Known Non-canonical", "Novel canonical", "Novel Non-canonical"), - Count = sj_counts, - Percentage = sj_perc + `SJs, count` = sj_counts, + `SJs, percent` = sj_perc, + check.names = FALSE ) # Remove row names and clean up @@ -291,6 +225,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { Median = median(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), Min = min(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), Max = max(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE), SD = sd(SQANTI_cell_summary$Genes_in_cell, na.rm = TRUE) ) unique_junctions_stats <- c( @@ -298,6 +233,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { Median = median(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), Min = min(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), Max = max(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE), SD = sd(SQANTI_cell_summary$UJCs_in_cell, na.rm = TRUE) ) reads_stats <- c( @@ -305,6 +241,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { Median = median(SQANTI_cell_summary[[count_col]], na.rm = TRUE), Min = min(SQANTI_cell_summary[[count_col]], na.rm = TRUE), Max = max(SQANTI_cell_summary[[count_col]], na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary[[count_col]], na.rm = TRUE), SD = sd(SQANTI_cell_summary[[count_col]], na.rm = TRUE) ) umis_stats <- c( @@ -312,6 +249,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { Median = median(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), Min = min(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), Max = max(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), + IQR = IQR(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE), SD = sd(SQANTI_cell_summary$UMIs_in_cell, na.rm = TRUE) ) summary_table1 <- data.frame( @@ -320,6 +258,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { Median = c(reads_stats["Median"], umis_stats["Median"], unique_genes_stats["Median"], unique_junctions_stats["Median"]), Min = c(reads_stats["Min"], umis_stats["Min"], unique_genes_stats["Min"], unique_junctions_stats["Min"]), Max = c(reads_stats["Max"], umis_stats["Max"], unique_genes_stats["Max"], unique_junctions_stats["Max"]), + IQR = c(reads_stats["IQR"], umis_stats["IQR"], unique_genes_stats["IQR"], unique_junctions_stats["IQR"]), SD = c(reads_stats["SD"], umis_stats["SD"], unique_genes_stats["SD"], unique_junctions_stats["SD"]) ) @@ -329,7 +268,7 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { summary_table1 <- summary_table1[!summary_table1$Feature == "Unique Junction Chains", ] } - summary_table1[, 2:6] <- round(summary_table1[, 2:6], 3) + summary_table1[, 2:7] <- round(summary_table1[, 2:7], 3) rownames(summary_table1) <- NULL render_sqanti_table(summary_table1, digits = 3, page_len = 10) @@ -356,6 +295,91 @@ if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { } ``` +### `r paste(entity_label, "Classification")` + +```{=html} + +``` + +
+ +```{r cells-read-counts} +if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { + struct_cat_cols <- c( + "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" + ) + struct_cat_names <- c( + "FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" + ) + count_stats <- sapply(struct_cat_cols, function(col) { + vals <- SQANTI_cell_summary[[col]] + c( + Mean = mean(vals, na.rm = TRUE), + Median = median(vals, na.rm = TRUE), + Min = min(vals, na.rm = TRUE), + Max = max(vals, na.rm = TRUE), + IQR = IQR(vals, na.rm = TRUE), + SD = sd(vals, na.rm = TRUE) + ) + }) + count_stats_df <- data.frame( + Category = struct_cat_names, + t(count_stats) + ) + colnames(count_stats_df)[2:7] <- c("Mean", "Median", "Min", "Max", "IQR", "SD") + count_stats_df[, 2:7] <- round(count_stats_df[, 2:7], 3) + rownames(count_stats_df) <- NULL + render_sqanti_table(count_stats_df, digits = 3, page_len = 10) +} +``` + +
+ +
+ +```{r cells-read-percentages} +if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { + struct_cat_cols <- c( + "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" + ) + struct_cat_names <- c( + "FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" + ) + prop_cat_cols <- paste0(struct_cat_cols, "_prop") + prop_stats <- sapply(prop_cat_cols, function(col) { + vals <- SQANTI_cell_summary[[col]] + c( + Mean = mean(vals, na.rm = TRUE), + Median = median(vals, na.rm = TRUE), + Min = min(vals, na.rm = TRUE), + Max = max(vals, na.rm = TRUE), + IQR = IQR(vals, na.rm = TRUE), + SD = sd(vals, na.rm = TRUE) + ) + }) + prop_stats_df <- data.frame( + Category = struct_cat_names, + t(prop_stats) + ) + colnames(prop_stats_df)[2:7] <- c("Mean", "Median", "Min", "Max", "IQR", "SD") + prop_stats_df[, 2:7] <- round(prop_stats_df[, 2:7], 3) + rownames(prop_stats_df) <- NULL + render_sqanti_table(prop_stats_df, digits = 3, page_len = 10) +} +``` + +
+ ### Splice Junction Classification ```{=html} @@ -471,89 +495,6 @@ if (exists("Junctions") && has_data(Junctions) && exists("SQANTI_cell_summary")
-### `r paste(entity_label, "Classification")` - -```{=html} - -``` - -
- -```{r cells-read-counts} -if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { - struct_cat_cols <- c( - "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" - ) - struct_cat_names <- c( - "FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" - ) - count_stats <- sapply(struct_cat_cols, function(col) { - vals <- SQANTI_cell_summary[[col]] - c( - Mean = mean(vals, na.rm = TRUE), - Median = median(vals, na.rm = TRUE), - Min = min(vals, na.rm = TRUE), - Max = max(vals, na.rm = TRUE), - SD = sd(vals, na.rm = TRUE) - ) - }) - count_stats_df <- data.frame( - Category = struct_cat_names, - t(count_stats) - ) - colnames(count_stats_df)[2:6] <- c("Mean", "Median", "Min", "Max", "SD") - count_stats_df[, 2:6] <- round(count_stats_df[, 2:6], 3) - rownames(count_stats_df) <- NULL - render_sqanti_table(count_stats_df, digits = 3, page_len = 10) -} -``` - -
- -
- -```{r cells-read-percentages} -if (exists("SQANTI_cell_summary") && has_data(SQANTI_cell_summary)) { - struct_cat_cols <- c( - "FSM", "ISM", "NIC", "NNC", "Genic_Genomic", "Antisense", "Fusion", "Intergenic", "Genic_intron" - ) - struct_cat_names <- c( - "FSM", "ISM", "NIC", "NNC", "Genic Genomic", "Antisense", "Fusion", "Intergenic", "Genic Intron" - ) - prop_cat_cols <- paste0(struct_cat_cols, "_prop") - prop_stats <- sapply(prop_cat_cols, function(col) { - vals <- SQANTI_cell_summary[[col]] - c( - Mean = mean(vals, na.rm = TRUE), - Median = median(vals, na.rm = TRUE), - Min = min(vals, na.rm = TRUE), - Max = max(vals, na.rm = TRUE), - SD = sd(vals, na.rm = TRUE) - ) - }) - prop_stats_df <- data.frame( - Category = struct_cat_names, - t(prop_stats) - ) - colnames(prop_stats_df)[2:6] <- c("Mean", "Median", "Min", "Max", "SD") - prop_stats_df[, 2:6] <- round(prop_stats_df[, 2:6], 3) - rownames(prop_stats_df) <- NULL - render_sqanti_table(prop_stats_df, digits = 3, page_len = 10) -} -``` - -
- ```{=html} @@ -659,8 +619,6 @@ if (mode != "isoforms") { } ``` - - ```{r depth-umis-check, include=FALSE} show_umis <- (mode != "isoforms" && exists("gg_umis_in_cells")) ``` @@ -670,7 +628,7 @@ if (show_umis) cat('
\n') ``` ```{r depth-umis, fig.height=6, fig.width=4, eval=show_umis} -print(plotly_to_ggplot(gg_umis_in_cells)) +print(gg_umis_in_cells) ``` ```{r depth-umis-div-end, results='asis'} @@ -692,7 +650,7 @@ if (show_ujcs) cat('
\n') ``` ```{r depth-ujcs, fig.height=6, fig.width=4, eval=show_ujcs} -print(plotly_to_ggplot(gg_JCs_in_cell)) +print(gg_JCs_in_cell) ``` ```{r depth-ujcs-div-end, results='asis'} @@ -732,7 +690,7 @@ if (exists("gg_genes_in_cells")) { ```{r genesacross-knownnovel, fig.height=8, fig.width=12} if (exists("gg_annotation_of_genes_in_cell")) { - print(plotly_to_ggplot(gg_annotation_of_genes_in_cell)) + print(gg_annotation_of_genes_in_cell) } ``` @@ -742,7 +700,7 @@ if (exists("gg_annotation_of_genes_in_cell")) { ```{r genesacross-percent, fig.height=8, fig.width=12} if (exists("gg_annotation_of_genes_percent_in_cell")) { - print(plotly_to_ggplot(gg_annotation_of_genes_percent_in_cell)) + print(gg_annotation_of_genes_percent_in_cell) } ``` @@ -773,7 +731,7 @@ if (exists("gg_annotation_of_genes_percent_in_cell")) { ```{r readspergene-perc-reads-annot, fig.height=8, fig.width=12} if (exists("gg_annotation_of_reads_in_cell")) { - print(plotly_to_ggplot(gg_annotation_of_reads_in_cell)) + print(gg_annotation_of_reads_in_cell) } ``` @@ -783,7 +741,7 @@ if (exists("gg_annotation_of_reads_in_cell")) { ```{r readspergene-bins-all, fig.height=8, fig.width=12} if (exists("gg_read_bins_all")) { - print(plotly_to_ggplot(gg_read_bins_all)) + print(gg_read_bins_all) } ``` @@ -793,7 +751,7 @@ if (exists("gg_read_bins_all")) { ```{r readspergene-bins-annot, fig.height=8, fig.width=12} if (exists("gg_read_bins")) { - print(plotly_to_ggplot(gg_read_bins)) + print(gg_read_bins) } ``` @@ -807,7 +765,7 @@ if (mode == "isoforms" && exists("gg_isoform_bins")) { ```{r iso-per-gene-plot, fig.height=8, fig.width=12} if (mode == "isoforms" && exists("gg_isoform_bins")) { - print(plotly_to_ggplot(gg_isoform_bins)) + print(gg_isoform_bins) } ``` @@ -843,7 +801,7 @@ show_ujcs_gene <- (mode != "isoforms" && exists("gg_ujc_bins_all")) ```{r ujcspergene-bins-all, fig.height=8, fig.width=12, eval=show_ujcs_gene} if (exists("gg_ujc_bins_all")) { - print(plotly_to_ggplot(gg_ujc_bins_all)) + print(gg_ujc_bins_all) } ``` @@ -854,7 +812,7 @@ if (exists("gg_ujc_bins_all")) { ```{r ujcspergene-bins-annot, fig.height=8, fig.width=12, eval=show_ujcs_gene} if (exists("gg_ujc_bins")) { - print(plotly_to_ggplot(gg_ujc_bins)) + print(gg_ujc_bins) } ``` @@ -870,7 +828,7 @@ if (exists("gg_ujc_bins")) { ```{r genechar-mt, fig.height=6, fig.width=4} if (exists("gg_MT_perc")) { - print(plotly_to_ggplot(gg_MT_perc)) + print(gg_MT_perc) } ``` @@ -878,8 +836,6 @@ if (exists("gg_MT_perc")) {
``` - - # `r paste(entity_label, "Length Characterization")` {.tabset .tabset-fade} ## Bulk Length Distribution @@ -903,7 +859,7 @@ if (exists("gg_MT_perc")) { ```{r bulk-length-all-plot, fig.height=8, fig.width=12} if (exists("gg_bulk_all_reads")) { - print(plotly_to_ggplot(gg_bulk_all_reads)) + print(gg_bulk_all_reads) } ``` @@ -913,7 +869,7 @@ if (exists("gg_bulk_all_reads")) { ```{r bulk-length-category-plot, fig.height=8, fig.width=12} if (exists("gg_bulk_length_by_category")) { - print(plotly_to_ggplot(gg_bulk_length_by_category)) + print(gg_bulk_length_by_category) } ``` @@ -923,7 +879,7 @@ if (exists("gg_bulk_length_by_category")) { ```{r bulk-length-exon-plot, fig.height=8, fig.width=12} if (exists("gg_bulk_length_by_exon_type")) { - print(plotly_to_ggplot(gg_bulk_length_by_exon_type)) + print(gg_bulk_length_by_exon_type) } ``` @@ -957,7 +913,7 @@ if (exists("gg_bulk_length_by_exon_type")) { ```{r multiexon-length-all-plot, fig.height=8, fig.width=12} if (exists("gg_read_distr")) { - print(plotly_to_ggplot(gg_read_distr)) + print(gg_read_distr) } ``` @@ -967,7 +923,7 @@ if (exists("gg_read_distr")) { ```{r multiexon-length-fsm-plot, fig.height=8, fig.width=12} if (exists("gg_FSM_read_distr")) { - print(plotly_to_ggplot(gg_FSM_read_distr)) + print(gg_FSM_read_distr) } ``` @@ -977,7 +933,7 @@ if (exists("gg_FSM_read_distr")) { ```{r multiexon-length-ism-plot, fig.height=8, fig.width=12} if (exists("gg_ISM_read_distr")) { - print(plotly_to_ggplot(gg_ISM_read_distr)) + print(gg_ISM_read_distr) } ``` @@ -987,7 +943,7 @@ if (exists("gg_ISM_read_distr")) { ```{r multiexon-length-nic-plot, fig.height=8, fig.width=12} if (exists("gg_NIC_read_distr")) { - print(plotly_to_ggplot(gg_NIC_read_distr)) + print(gg_NIC_read_distr) } ``` @@ -997,7 +953,7 @@ if (exists("gg_NIC_read_distr")) { ```{r multiexon-length-nnc-plot, fig.height=8, fig.width=12} if (exists("gg_NNC_read_distr")) { - print(plotly_to_ggplot(gg_NNC_read_distr)) + print(gg_NNC_read_distr) } ``` @@ -1007,7 +963,7 @@ if (exists("gg_NNC_read_distr")) { ```{r multiexon-length-genic-plot, fig.height=8, fig.width=12} if (exists("gg_genic_read_distr")) { - print(plotly_to_ggplot(gg_genic_read_distr)) + print(gg_genic_read_distr) } ``` @@ -1017,7 +973,7 @@ if (exists("gg_genic_read_distr")) { ```{r multiexon-length-antisense-plot, fig.height=8, fig.width=12} if (exists("gg_antisense_read_distr")) { - print(plotly_to_ggplot(gg_antisense_read_distr)) + print(gg_antisense_read_distr) } ``` @@ -1027,7 +983,7 @@ if (exists("gg_antisense_read_distr")) { ```{r multiexon-length-fusion-plot, fig.height=8, fig.width=12} if (exists("gg_fusion_read_distr")) { - print(plotly_to_ggplot(gg_fusion_read_distr)) + print(gg_fusion_read_distr) } ``` @@ -1037,7 +993,7 @@ if (exists("gg_fusion_read_distr")) { ```{r multiexon-length-intergenic-plot, fig.height=8, fig.width=12} if (exists("gg_intergenic_read_distr")) { - print(plotly_to_ggplot(gg_intergenic_read_distr)) + print(gg_intergenic_read_distr) } ``` @@ -1047,7 +1003,7 @@ if (exists("gg_intergenic_read_distr")) { ```{r multiexon-length-genicintron-plot, fig.height=8, fig.width=12} if (exists("gg_genic_intron_read_distr")) { - print(plotly_to_ggplot(gg_genic_intron_read_distr)) + print(gg_genic_intron_read_distr) } ``` @@ -1079,7 +1035,7 @@ if (exists("gg_genic_intron_read_distr")) { ```{r monoexon-length-all-plot, fig.height=8, fig.width=10} if (exists("gg_read_distr_mono")) { - print(plotly_to_ggplot(gg_read_distr_mono)) + print(gg_read_distr_mono) } ``` @@ -1089,7 +1045,7 @@ if (exists("gg_read_distr_mono")) { ```{r monoexon-length-fsm-plot, fig.height=8, fig.width=10} if (exists("gg_FSM_mono_read_distr")) { - print(plotly_to_ggplot(gg_FSM_mono_read_distr)) + print(gg_FSM_mono_read_distr) } ``` @@ -1099,7 +1055,7 @@ if (exists("gg_FSM_mono_read_distr")) { ```{r monoexon-length-ism-plot, fig.height=8, fig.width=10} if (exists("gg_ISM_mono_read_distr")) { - print(plotly_to_ggplot(gg_ISM_mono_read_distr)) + print(gg_ISM_mono_read_distr) } ``` @@ -1109,7 +1065,7 @@ if (exists("gg_ISM_mono_read_distr")) { ```{r monoexon-length-nic-plot, fig.height=8, fig.width=10} if (exists("gg_NIC_mono_read_distr")) { - print(plotly_to_ggplot(gg_NIC_mono_read_distr)) + print(gg_NIC_mono_read_distr) } ``` @@ -1119,7 +1075,7 @@ if (exists("gg_NIC_mono_read_distr")) { ```{r monoexon-length-genic-plot, fig.height=8, fig.width=10} if (exists("gg_genic_mono_read_distr")) { - print(plotly_to_ggplot(gg_genic_mono_read_distr)) + print(gg_genic_mono_read_distr) } ``` @@ -1129,7 +1085,7 @@ if (exists("gg_genic_mono_read_distr")) { ```{r monoexon-length-antisense-plot, fig.height=8, fig.width=10} if (exists("gg_antisense_mono_read_distr")) { - print(plotly_to_ggplot(gg_antisense_mono_read_distr)) + print(gg_antisense_mono_read_distr) } ``` @@ -1139,7 +1095,7 @@ if (exists("gg_antisense_mono_read_distr")) { ```{r monoexon-length-intergenic-plot, fig.height=8, fig.width=10} if (exists("gg_intergenic_mono_read_distr")) { - print(plotly_to_ggplot(gg_intergenic_mono_read_distr)) + print(gg_intergenic_mono_read_distr) } ``` @@ -1149,7 +1105,7 @@ if (exists("gg_intergenic_mono_read_distr")) { ```{r monoexon-length-genicintron-plot, fig.height=8, fig.width=10} if (exists("gg_genic_intron_mono_read_distr")) { - print(plotly_to_ggplot(gg_genic_intron_mono_read_distr)) + print(gg_genic_intron_mono_read_distr) } ``` @@ -1157,14 +1113,53 @@ if (exists("gg_genic_intron_mono_read_distr")) { ## Reference Transcript Coverage +```{=html} + +``` + +
+ ```{r ref-transcript-coverage, fig.height=8, fig.width=12} if (exists("gg_ref_coverage_across_category")) { - print(plotly_to_ggplot(gg_ref_coverage_across_category)) + print(gg_ref_coverage_across_category) +} +``` + +
+ + + +## Reference Transcriptome + +```{r isoforms-ref-lengths-plot, fig.height=8, fig.width=10} +if (exists("gg_isoforms_ref_vs_sample_lengths") && !is.null(gg_isoforms_ref_vs_sample_lengths)) { + print(gg_isoforms_ref_vs_sample_lengths) } ``` # `r paste(entity_label, "Structural Characterization")` {.tabset .tabset-fade} + ## Distribution by Structural Categories ```{=html} @@ -1195,7 +1190,7 @@ if (exists("gg_ref_coverage_across_category")) { ```{r rd-across-cat, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_category")) { - print(plotly_to_ggplot(gg_SQANTI_across_category)) + print(gg_SQANTI_across_category) } ``` @@ -1205,7 +1200,7 @@ if (exists("gg_SQANTI_across_category")) { ```{r rd-across-mono, fig.height=8, fig.width=12} if (exists("gg_exon_mono_by_category")) { - print(plotly_to_ggplot(gg_exon_mono_by_category)) + print(gg_exon_mono_by_category) } ``` @@ -1215,7 +1210,7 @@ if (exists("gg_exon_mono_by_category")) { ```{r rd-fsm, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_FSM")) { - print(plotly_to_ggplot(gg_SQANTI_across_FSM)) + print(gg_SQANTI_across_FSM) } ``` @@ -1225,7 +1220,7 @@ if (exists("gg_SQANTI_across_FSM")) { ```{r rd-ism, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_ISM")) { - print(plotly_to_ggplot(gg_SQANTI_across_ISM)) + print(gg_SQANTI_across_ISM) } ``` @@ -1235,7 +1230,7 @@ if (exists("gg_SQANTI_across_ISM")) { ```{r rd-nic, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_NIC")) { - print(plotly_to_ggplot(gg_SQANTI_across_NIC)) + print(gg_SQANTI_across_NIC) } ``` @@ -1245,7 +1240,7 @@ if (exists("gg_SQANTI_across_NIC")) { ```{r rd-nnc, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_NNC")) { - print(plotly_to_ggplot(gg_SQANTI_across_NNC)) + print(gg_SQANTI_across_NNC) } ``` @@ -1255,7 +1250,7 @@ if (exists("gg_SQANTI_across_NNC")) { ```{r rd-genic, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_Genic")) { - print(plotly_to_ggplot(gg_SQANTI_across_Genic)) + print(gg_SQANTI_across_Genic) } ``` @@ -1265,7 +1260,7 @@ if (exists("gg_SQANTI_across_Genic")) { ```{r rd-antisense, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_Antisense")) { - print(plotly_to_ggplot(gg_SQANTI_across_Antisense)) + print(gg_SQANTI_across_Antisense) } ``` @@ -1275,7 +1270,7 @@ if (exists("gg_SQANTI_across_Antisense")) { ```{r rd-fusion, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_Fusion")) { - print(plotly_to_ggplot(gg_SQANTI_across_Fusion)) + print(gg_SQANTI_across_Fusion) } ``` @@ -1285,7 +1280,7 @@ if (exists("gg_SQANTI_across_Fusion")) { ```{r rd-intergenic, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_Intergenic")) { - print(plotly_to_ggplot(gg_SQANTI_across_Intergenic)) + print(gg_SQANTI_across_Intergenic) } ``` @@ -1295,7 +1290,7 @@ if (exists("gg_SQANTI_across_Intergenic")) { ```{r rd-genic-intron, fig.height=8, fig.width=12} if (exists("gg_SQANTI_across_Genic_Intron")) { - print(plotly_to_ggplot(gg_SQANTI_across_Genic_Intron)) + print(gg_SQANTI_across_Genic_Intron) } ``` @@ -1330,7 +1325,7 @@ if (exists("gg_SQANTI_across_Genic_Intron")) { ```{r exoncounts-across, fig.height=8, fig.width=12} if (exists("gg_exon_mean_by_category")) { - print(plotly_to_ggplot(gg_exon_mean_by_category)) + print(gg_exon_mean_by_category) } ``` @@ -1340,7 +1335,7 @@ if (exists("gg_exon_mean_by_category")) { ```{r exoncounts-fsm-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["FSM"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["FSM"]])) + print(gg_exon_profile_by_category[["FSM"]]) } ``` @@ -1350,7 +1345,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-ism-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["ISM"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["ISM"]])) + print(gg_exon_profile_by_category[["ISM"]]) } ``` @@ -1360,7 +1355,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-nic-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["NIC"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["NIC"]])) + print(gg_exon_profile_by_category[["NIC"]]) } ``` @@ -1370,7 +1365,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-nnc-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["NNC"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["NNC"]])) + print(gg_exon_profile_by_category[["NNC"]]) } ``` @@ -1380,7 +1375,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-genic-genomic-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["Genic Genomic"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["Genic Genomic"]])) + print(gg_exon_profile_by_category[["Genic Genomic"]]) } ``` @@ -1390,7 +1385,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-antisense-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["Antisense"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["Antisense"]])) + print(gg_exon_profile_by_category[["Antisense"]]) } ``` @@ -1400,7 +1395,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-fusion-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["Fusion"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["Fusion"]])) + print(gg_exon_profile_by_category[["Fusion"]]) } ``` @@ -1410,7 +1405,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-intergenic-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["Intergenic"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["Intergenic"]])) + print(gg_exon_profile_by_category[["Intergenic"]]) } ``` @@ -1420,7 +1415,7 @@ if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_categor ```{r exoncounts-genic-intron-prof, fig.height=8, fig.width=12} if (exists("gg_exon_profile_by_category") && !is.null(gg_exon_profile_by_category[["Genic Intron"]])) { - print(plotly_to_ggplot(gg_exon_profile_by_category[["Genic Intron"]])) + print(gg_exon_profile_by_category[["Genic Intron"]]) } ``` @@ -1454,7 +1449,7 @@ show_coding <- (mode == "isoforms" && exists("gg_coding_across_category")) ```{r coding-coding, fig.height=8, fig.width=12, eval=show_coding} if (exists("gg_coding_across_category")) { - print(plotly_to_ggplot(gg_coding_across_category)) + print(gg_coding_across_category) } ``` @@ -1466,7 +1461,7 @@ if (exists("gg_coding_across_category")) { ```{r coding-noncoding, fig.height=8, fig.width=12, eval=show_coding} if (exists("gg_non_coding_across_category")) { - print(plotly_to_ggplot(gg_non_coding_across_category)) + print(gg_non_coding_across_category) } ``` @@ -1497,7 +1492,7 @@ if (exists("gg_non_coding_across_category")) { ```{r sj-dist-class, fig.height=8, fig.width=12} if (exists("gg_known_novel_canon")) { - print(plotly_to_ggplot(gg_known_novel_canon)) + print(gg_known_novel_canon) } ``` @@ -1517,7 +1512,7 @@ if (exists("gg_sj_type_by_category_stack")) { ```{r sj-dist-readscanon, fig.height=8, fig.width=12} if (exists("gg_allcanon_by_category")) { - print(plotly_to_ggplot(gg_allcanon_by_category)) + print(gg_allcanon_by_category) } ``` @@ -1525,47 +1520,47 @@ if (exists("gg_allcanon_by_category")) { ## RT-switching -```{=html} - +```{r sjchar-rts-section, results='asis'} +if (exists("mode") && mode == "isoforms") { + # Isoforms mode: only one figure, show directly without dropdown + cat("\n") +} else { + # Reads mode: show full two-option dropdown + htmltools::HTML(paste0( + '\n' + )) +} ``` -
- ```{r sj-rts-all, fig.height=8, fig.width=12} if (exists("gg_rts_all_by_sjtype")) { - print(plotly_to_ggplot(gg_rts_all_by_sjtype)) + print(gg_rts_all_by_sjtype) } ``` -
- -
- -```{r sj-rts-uniq, fig.height=8, fig.width=12} +```{r sj-rts-uniq, fig.height=8, fig.width=12, eval=(!exists('mode') || mode != 'isoforms')} if (exists("gg_rts_unique_by_sjtype")) { - print(plotly_to_ggplot(gg_rts_unique_by_sjtype)) + print(gg_rts_unique_by_sjtype) } ``` -
- # Features of Bad Quality {.tabset .tabset-fade} ## All `r entity_label_plural` ```{r badq-allreads, fig.height=8, fig.width=12} if (exists("gg_bad_feature")) { - print(plotly_to_ggplot(gg_bad_feature)) + print(gg_bad_feature) } ``` @@ -1591,7 +1586,7 @@ if (exists("gg_bad_feature")) { ```{r badfeatures-intrapriming, fig.height=8, fig.width=12} if (exists("gg_intrapriming_by_category")) { - print(plotly_to_ggplot(gg_intrapriming_by_category)) + print(gg_intrapriming_by_category) } ``` @@ -1601,7 +1596,7 @@ if (exists("gg_intrapriming_by_category")) { ```{r badfeatures-rts, fig.height=8, fig.width=12} if (exists("gg_RTS_by_category")) { - print(plotly_to_ggplot(gg_RTS_by_category)) + print(gg_RTS_by_category) } ``` @@ -1611,7 +1606,7 @@ if (exists("gg_RTS_by_category")) { ```{r badfeatures-noncanon, fig.height=8, fig.width=12} if (exists("gg_noncanon_by_category")) { - print(plotly_to_ggplot(gg_noncanon_by_category)) + print(gg_noncanon_by_category) } ``` @@ -1621,7 +1616,7 @@ if (exists("gg_noncanon_by_category")) { ```{r badfeatures-nmd, fig.height=8, fig.width=12} if (exists("gg_nmd_by_category")) { - print(plotly_to_ggplot(gg_nmd_by_category)) + print(gg_nmd_by_category) } ``` @@ -1633,7 +1628,7 @@ if (exists("gg_nmd_by_category")) { ```{r goodq-allreads, fig.height=8, fig.width=12} if (exists("gg_good_feature")) { - print(plotly_to_ggplot(gg_good_feature)) + print(gg_good_feature) } ``` @@ -1663,12 +1658,12 @@ htmltools::HTML(paste0( '
\n', ' \n', + " \n", ' \n', - '
\n', - '
\n' + paste(dropdown_items, collapse = "\n"), "\n", + " \n", + "
\n", + "\n" )) ``` @@ -1676,7 +1671,7 @@ htmltools::HTML(paste0( ```{r goodq-tss, fig.height=8, fig.width=12} if (exists("gg_tss_annotation_support")) { - print(plotly_to_ggplot(gg_tss_annotation_support)) + print(gg_tss_annotation_support) } ``` @@ -1686,7 +1681,7 @@ if (exists("gg_tss_annotation_support")) { ```{r goodq-cage, fig.height=8, fig.width=12} if (exists("CAGE_peak") && CAGE_peak && exists("gg_cage_peak_support")) { - print(plotly_to_ggplot(gg_cage_peak_support)) + print(gg_cage_peak_support) } ``` @@ -1696,7 +1691,7 @@ if (exists("CAGE_peak") && CAGE_peak && exists("gg_cage_peak_support")) { ```{r goodq-polya, fig.height=8, fig.width=12} if (exists("polyA_motif_list") && polyA_motif_list && exists("gg_polyA_motif_support")) { - print(plotly_to_ggplot(gg_polyA_motif_support)) + print(gg_polyA_motif_support) } ``` @@ -1738,7 +1733,7 @@ $(document).ready(function() { ```{r goodq-canon, fig.height=8, fig.width=12} if (exists("gg_canon_by_category")) { - print(plotly_to_ggplot(gg_canon_by_category)) + print(gg_canon_by_category) } ``` @@ -1748,7 +1743,7 @@ if (exists("gg_canon_by_category")) { ```{r goodq-sr-validation, fig.height=8, fig.width=12} if (exists("gg_sr_support_by_category")) { - print(plotly_to_ggplot(gg_sr_support_by_category)) + print(gg_sr_support_by_category) } ``` @@ -1758,7 +1753,7 @@ if (exists("gg_sr_support_by_category")) { ```{r goodq-tss-validation, fig.height=8, fig.width=12} if (exists("gg_tss_validation_by_category")) { - print(plotly_to_ggplot(gg_tss_validation_by_category)) + print(gg_tss_validation_by_category) } ``` @@ -1787,14 +1782,7 @@ if (show_umap) cat('
\n') ``` ```{r clustering-umap, fig.height=6, fig.width=8, eval=show_umap} -if (knitr::is_html_output()) { - # Add margin to avoid title overlap with mode bar in HTML - render_ggplotly(gg_umap, target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% plotly::toWebGL() -} else { - # For PDF, render the ggplot object directly - print(gg_umap) -} +print(gg_umap) ``` ```{r clustering-umap-div-end, results='asis'} @@ -1802,13 +1790,114 @@ if (show_umap) cat("
\n") ``` ```{r clustering-cat-header, results='asis'} -if (show_umap && exists("gg_umap_by_category")) { - cat("## Structural Categories\n") +show_umap_cat <- (exists("show_umap") && show_umap && exists("gg_umap_by_category")) +show_cat_cluster <- (exists("gg_cat_cluster_plots") && length(gg_cat_cluster_plots) > 0) + +if (show_umap_cat || show_cat_cluster) { + cat("## Structural Categories {.tabset}\n") } ``` ```{r clustering-cat-check, include=FALSE} -show_umap_cat <- (show_umap && exists("gg_umap_by_category")) +# Variables already defined above, kept for backwards compatibility if needed elsewhere +``` + +```{r cat-dist-tab, results='asis'} +if (show_cat_cluster) { + cat("\n### Distributions\n") +} +``` + +```{=html} + +``` + + +
+```{r cat-cluster-FSM, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["FSM"]])) { + print(gg_cat_cluster_plots[["FSM"]]) +} +``` +
+
+```{r cat-cluster-ISM, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["ISM"]])) { + print(gg_cat_cluster_plots[["ISM"]]) +} +``` +
+
+```{r cat-cluster-NIC, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["NIC"]])) { + print(gg_cat_cluster_plots[["NIC"]]) +} +``` +
+
+```{r cat-cluster-NNC, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["NNC"]])) { + print(gg_cat_cluster_plots[["NNC"]]) +} +``` +
+
+```{r cat-cluster-Genic_Genomic_cat, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["Genic Genomic"]])) { + print(gg_cat_cluster_plots[["Genic Genomic"]]) +} +``` +
+
+```{r cat-cluster-Antisense, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["Antisense"]])) { + print(gg_cat_cluster_plots[["Antisense"]]) +} +``` +
+
+```{r cat-cluster-Fusion, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["Fusion"]])) { + print(gg_cat_cluster_plots[["Fusion"]]) +} +``` +
+
+```{r cat-cluster-Intergenic, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["Intergenic"]])) { + print(gg_cat_cluster_plots[["Intergenic"]]) +} +``` +
+
+```{r cat-cluster-Genic_Intron_cat, fig.height=6, fig.width=10, eval=show_cat_cluster, out.width="100%"} +if (show_cat_cluster && !is.null(gg_cat_cluster_plots[["Genic Intron"]])) { + print(gg_cat_cluster_plots[["Genic Intron"]]) +} +``` +
+ +```{r umap-cat-tab, results='asis'} +if (show_umap_cat) { + cat("\n### UMAPs\n") +} ``` ```{=html} @@ -1834,91 +1923,43 @@ show_umap_cat <- (show_umap && exists("gg_umap_by_category")) @@ -1932,9 +1973,7 @@ if (show_umap_cat) cat('
\n')
```{r umap-cat-FSM, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["FSM"]])) { - render_ggplotly(gg_umap_by_category[["FSM"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["FSM"]]) } ```
@@ -1942,9 +1981,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["FSM"]])) {
```{r umap-cat-ISM, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["ISM"]])) { - render_ggplotly(gg_umap_by_category[["ISM"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["ISM"]]) } ```
@@ -1952,9 +1989,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["ISM"]])) {
```{r umap-cat-NIC, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["NIC"]])) { - render_ggplotly(gg_umap_by_category[["NIC"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["NIC"]]) } ```
@@ -1962,9 +1997,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["NIC"]])) {
```{r umap-cat-NNC, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["NNC"]])) { - render_ggplotly(gg_umap_by_category[["NNC"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["NNC"]]) } ```
@@ -1972,9 +2005,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["NNC"]])) {
```{r umap-cat-GenicGenomic, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["Genic Genomic"]])) { - render_ggplotly(gg_umap_by_category[["Genic Genomic"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["Genic Genomic"]]) } ```
@@ -1982,9 +2013,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["Genic Genomic"]])) {
```{r umap-cat-Antisense, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["Antisense"]])) { - render_ggplotly(gg_umap_by_category[["Antisense"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["Antisense"]]) } ```
@@ -1992,9 +2021,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["Antisense"]])) {
```{r umap-cat-Fusion, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["Fusion"]])) { - render_ggplotly(gg_umap_by_category[["Fusion"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["Fusion"]]) } ```
@@ -2002,9 +2029,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["Fusion"]])) {
```{r umap-cat-Intergenic, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["Intergenic"]])) { - render_ggplotly(gg_umap_by_category[["Intergenic"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["Intergenic"]]) } ```
@@ -2012,9 +2037,7 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["Intergenic"]])) {
```{r umap-cat-GenicIntron, fig.height=6, fig.width=8, eval=show_umap_cat} if (show_umap_cat && !is.null(gg_umap_by_category[["Genic Intron"]])) { - render_ggplotly(gg_umap_by_category[["Genic Intron"]], target_width = 800, target_height = 700) %>% - layout(margin = list(t = 100)) %>% - fix_umap_plotly() + print(gg_umap_by_category[["Genic Intron"]]) } ```
@@ -2023,6 +2046,107 @@ if (show_umap_cat && !is.null(gg_umap_by_category[["Genic Intron"]])) { if (show_umap_cat) cat("
\n") ``` +```{r len-cluster-header, results='asis'} +show_len_cluster <- (exists("gg_len_cluster_plots") && length(gg_len_cluster_plots) > 0) +if (show_len_cluster) { + cat("\n## Lengths\n") +} +``` + +```{=html} + +``` + +
+```{r len-cluster-All, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +all_lbl <- if (mode == "isoforms") "All Transcripts" else "All Reads" +if (show_len_cluster && !is.null(gg_len_cluster_plots[[all_lbl]])) { + print(gg_len_cluster_plots[[all_lbl]]) +} +``` +
+
+```{r len-cluster-FSM, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["FSM"]])) { + print(gg_len_cluster_plots[["FSM"]]) +} +``` +
+
+```{r len-cluster-ISM, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["ISM"]])) { + print(gg_len_cluster_plots[["ISM"]]) +} +``` +
+
+```{r len-cluster-NIC, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["NIC"]])) { + print(gg_len_cluster_plots[["NIC"]]) +} +``` +
+
+```{r len-cluster-NNC, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["NNC"]])) { + print(gg_len_cluster_plots[["NNC"]]) +} +``` +
+
+```{r len-cluster-GenicGenomic, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["Genic Genomic"]])) { + print(gg_len_cluster_plots[["Genic Genomic"]]) +} +``` +
+
+```{r len-cluster-Antisense, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["Antisense"]])) { + print(gg_len_cluster_plots[["Antisense"]]) +} +``` +
+
+```{r len-cluster-Fusion, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["Fusion"]])) { + print(gg_len_cluster_plots[["Fusion"]]) +} +``` +
+
+```{r len-cluster-Intergenic, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["Intergenic"]])) { + print(gg_len_cluster_plots[["Intergenic"]]) +} +``` +
+
+```{r len-cluster-GenicIntron, fig.height=6, fig.width=10, eval=show_len_cluster, out.width="100%"} +if (show_len_cluster && !is.null(gg_len_cluster_plots[["Genic Intron"]])) { + print(gg_len_cluster_plots[["Genic Intron"]]) +} +``` +
+ ```{r clustering-umap-missing, results='asis'} if (!show_umap) { cat("Clustering and UMAP analysis was not run or results are missing.\n") @@ -2037,7 +2161,6 @@ if (mode != "reads" && exists("gg_sr_cluster_plots") && length(gg_sr_cluster_plo ```{r sr-cluster-check, include=FALSE} show_sr_cluster <- (exists("gg_sr_cluster_plots") && length(gg_sr_cluster_plots) > 0) - ``` ```{r sr-dist-tab, results='asis'} @@ -2071,79 +2194,79 @@ if (show_sr_cluster) {
```{r sr-cluster-Global, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["All Transcripts"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["All Transcripts"]])) + print(gg_sr_cluster_plots[["All Transcripts"]]) } ```
```{r sr-cluster-FSM, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["FSM"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["FSM"]])) + print(gg_sr_cluster_plots[["FSM"]]) } ```
```{r sr-cluster-ISM, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["ISM"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["ISM"]])) + print(gg_sr_cluster_plots[["ISM"]]) } ```
```{r sr-cluster-NIC, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["NIC"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["NIC"]])) + print(gg_sr_cluster_plots[["NIC"]]) } ```
```{r sr-cluster-NNC, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["NNC"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["NNC"]])) + print(gg_sr_cluster_plots[["NNC"]]) } ```
```{r sr-cluster-Genic_Genomic, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["Genic Genomic"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["Genic Genomic"]])) + print(gg_sr_cluster_plots[["Genic Genomic"]]) } ```
```{r sr-cluster-Antisense, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["Antisense"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["Antisense"]])) + print(gg_sr_cluster_plots[["Antisense"]]) } ```
```{r sr-cluster-Fusion, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["Fusion"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["Fusion"]])) + print(gg_sr_cluster_plots[["Fusion"]]) } ```
```{r sr-cluster-Intergenic, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["Intergenic"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["Intergenic"]])) + print(gg_sr_cluster_plots[["Intergenic"]]) } ```
```{r sr-cluster-Genic_Intron, fig.height=6, fig.width=10, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_cluster_plots[["Genic Intron"]])) { - print(plotly_to_ggplot(gg_sr_cluster_plots[["Genic Intron"]])) + print(gg_sr_cluster_plots[["Genic Intron"]]) } ```
```{r sr-umap-tab-and-dropdown, results='asis'} - if (show_sr_cluster && exists("gg_sr_umap_plots") && length(gg_sr_umap_plots) > 0) { - cat("\n### UMAPs\n") - - cat(' +if (show_sr_cluster && exists("gg_sr_umap_plots") && length(gg_sr_umap_plots) > 0) { + cat("\n### UMAPs\n") + + cat(' ') - } +} ```
```{r sr-umap-Global, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["All Transcripts"]])) { - render_ggplotly(gg_sr_umap_plots[["All Transcripts"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["All Transcripts"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2180,9 +2305,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["All Transcripts"]])) {
```{r sr-umap-FSM, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["FSM"]])) { - render_ggplotly(gg_sr_umap_plots[["FSM"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["FSM"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2190,9 +2317,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["FSM"]])) {
```{r sr-umap-ISM, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["ISM"]])) { - render_ggplotly(gg_sr_umap_plots[["ISM"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["ISM"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2200,9 +2329,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["ISM"]])) {
```{r sr-umap-NIC, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["NIC"]])) { - render_ggplotly(gg_sr_umap_plots[["NIC"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["NIC"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2210,9 +2341,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["NIC"]])) {
```{r sr-umap-NNC, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["NNC"]])) { - render_ggplotly(gg_sr_umap_plots[["NNC"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["NNC"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2220,9 +2353,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["NNC"]])) {
```{r sr-umap-Genic_Genomic, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Genic Genomic"]])) { - render_ggplotly(gg_sr_umap_plots[["Genic Genomic"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["Genic Genomic"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2230,9 +2365,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Genic Genomic"]])) {
```{r sr-umap-Antisense, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Antisense"]])) { - render_ggplotly(gg_sr_umap_plots[["Antisense"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["Antisense"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2240,9 +2377,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Antisense"]])) {
```{r sr-umap-Fusion, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Fusion"]])) { - render_ggplotly(gg_sr_umap_plots[["Fusion"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["Fusion"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2250,9 +2389,11 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Fusion"]])) {
```{r sr-umap-Intergenic, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Intergenic"]])) { - render_ggplotly(gg_sr_umap_plots[["Intergenic"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["Intergenic"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2260,15 +2401,15 @@ if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Intergenic"]])) {
```{r sr-umap-Genic_Intron, fig.height=6, fig.width=8, eval=show_sr_cluster, out.width="100%"} if (show_sr_cluster && !is.null(gg_sr_umap_plots[["Genic Intron"]])) { - render_ggplotly(gg_sr_umap_plots[["Genic Intron"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_sr_umap_plots[["Genic Intron"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
- - ```{r tss-cluster-header, results='asis'} if (mode != "reads" && exists("gg_tss_cluster_plots") && length(gg_tss_cluster_plots) > 0) { cat("## TSS Coverage by Short Reads {.tabset}\n") @@ -2310,7 +2451,7 @@ if (show_tss_cluster) {
```{r tss-cluster-Global, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["All Transcripts"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["All Transcripts"]])) + print(gg_tss_cluster_plots[["All Transcripts"]]) } ```
@@ -2318,7 +2459,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["All Transcripts"]])) {
```{r tss-cluster-FSM, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["FSM"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["FSM"]])) + print(gg_tss_cluster_plots[["FSM"]]) } ```
@@ -2326,7 +2467,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["FSM"]])) {
```{r tss-cluster-ISM, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["ISM"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["ISM"]])) + print(gg_tss_cluster_plots[["ISM"]]) } ```
@@ -2334,7 +2475,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["ISM"]])) {
```{r tss-cluster-NIC, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["NIC"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["NIC"]])) + print(gg_tss_cluster_plots[["NIC"]]) } ```
@@ -2342,7 +2483,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["NIC"]])) {
```{r tss-cluster-NNC, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["NNC"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["NNC"]])) + print(gg_tss_cluster_plots[["NNC"]]) } ```
@@ -2350,7 +2491,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["NNC"]])) {
```{r tss-cluster-Genic_Genomic, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Genic Genomic"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["Genic Genomic"]])) + print(gg_tss_cluster_plots[["Genic Genomic"]]) } ```
@@ -2358,7 +2499,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Genic Genomic"]])) {
```{r tss-cluster-Antisense, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Antisense"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["Antisense"]])) + print(gg_tss_cluster_plots[["Antisense"]]) } ```
@@ -2366,7 +2507,7 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Antisense"]])) {
```{r tss-cluster-Fusion, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Fusion"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["Fusion"]])) + print(gg_tss_cluster_plots[["Fusion"]]) } ```
@@ -2374,16 +2515,16 @@ if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Fusion"]])) {
```{r tss-cluster-Intergenic, fig.height=6, fig.width=10, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_cluster_plots[["Intergenic"]])) { - print(plotly_to_ggplot(gg_tss_cluster_plots[["Intergenic"]])) + print(gg_tss_cluster_plots[["Intergenic"]]) } ```
```{r tss-umap-tab-and-dropdown, results='asis'} - if (show_tss_cluster && exists("gg_tss_umap_plots") && length(gg_tss_umap_plots) > 0) { - cat("\n### UMAPs\n") - - cat(' +if (show_tss_cluster && exists("gg_tss_umap_plots") && length(gg_tss_umap_plots) > 0) { + cat("\n### UMAPs\n") + + cat(' ') - } +} ```
```{r tss-umap-Global, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["All Transcripts"]])) { - render_ggplotly(gg_tss_umap_plots[["All Transcripts"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["All Transcripts"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
@@ -2420,81 +2563,99 @@ if (show_tss_cluster && !is.null(gg_tss_umap_plots[["All Transcripts"]])) {
```{r tss-umap-FSM, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["FSM"]])) { - render_ggplotly(gg_tss_umap_plots[["FSM"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["FSM"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-ISM, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["ISM"]])) { - render_ggplotly(gg_tss_umap_plots[["ISM"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["ISM"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-NIC, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["NIC"]])) { - render_ggplotly(gg_tss_umap_plots[["NIC"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["NIC"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-NNC, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["NNC"]])) { - render_ggplotly(gg_tss_umap_plots[["NNC"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["NNC"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-Genic_Genomic, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["Genic Genomic"]])) { - render_ggplotly(gg_tss_umap_plots[["Genic Genomic"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["Genic Genomic"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-Antisense, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["Antisense"]])) { - render_ggplotly(gg_tss_umap_plots[["Antisense"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["Antisense"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-Fusion, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["Fusion"]])) { - render_ggplotly(gg_tss_umap_plots[["Fusion"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["Fusion"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-Intergenic, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["Intergenic"]])) { - render_ggplotly(gg_tss_umap_plots[["Intergenic"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["Intergenic"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
```{r tss-umap-Genic_Intron, fig.height=6, fig.width=8, eval=show_tss_cluster, out.width="100%"} if (show_tss_cluster && !is.null(gg_tss_umap_plots[["Genic Intron"]])) { - render_ggplotly(gg_tss_umap_plots[["Genic Intron"]], target_width = 800, target_height = 700) %>% fix_umap_plotly() + gg_tss_umap_plots[["Genic Intron"]] } else { - print(ggplot2::ggplot() + ggplot2::annotate("text", x=0.5, y=0.5, label="No data available") + ggplot2::theme_void()) + print(ggplot2::ggplot() + + ggplot2::annotate("text", x = 0.5, y = 0.5, label = "No data available") + + ggplot2::theme_void()) } ```
\ No newline at end of file diff --git a/src/report_assets/style-multisample.css b/src/report_assets/style-multisample.css index b4e42bf..9c91e60 100644 --- a/src/report_assets/style-multisample.css +++ b/src/report_assets/style-multisample.css @@ -1,473 +1,418 @@ -body { - font-size: 14px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - line-height: 1.6; - color: #2c3e50; -} - -td { - font-size: 14px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; -} - -h1 { - font-size: 32px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #2c3e50; - padding-top: 20px; - border-bottom: 3px solid #3498db; - padding-bottom: 10px; - margin-top: 0; -} - -h2 { - font-size: 24px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #2c3e50; - padding-top: 30px; - border-bottom: 2px solid #ecf0f1; - padding-bottom: 8px; - margin-top: 0; -} - -h3 { - font-size: 20px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #34495e; - padding-top: 20px; - margin-top: 0; -} - -h4 { - font-size: 16px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #7f8c8d; - font-style: italic; - padding-top: 0; - margin-top: 0; -} - -/* TOC */ -#TOC { - background-color: #f8f9fa; - border: 1px solid #dee2e6; - border-radius: 8px; - padding: 20px; - margin-bottom: 30px; -} - -#TOC ul { - list-style-type: none; - padding-left: 0; -} - -#TOC ul ul { - padding-left: 20px; -} - -#TOC a { - color: #495057; - text-decoration: none; - display: block; - padding: 2px 0; -} - -#TOC a:hover { - color: #3498db; - text-decoration: underline; -} - -/* Tables */ -table { - border-collapse: collapse; - margin: 20px 0; - min-width: 300px; - font-size: 14px; - border-radius: 8px; - overflow: hidden; - box-shadow: 0 0 20px rgba(0, 0, 0, 0.08); -} - -table thead tr { - background-color: #3498db; - color: #ffffff; - text-align: left; - font-weight: bold; -} - -table th, -table td { - padding: 12px 15px; - border: 1px solid #dddddd; -} - -table tbody tr { - border-bottom: 1px solid #dddddd; -} - -table tbody tr:nth-of-type(even) { - background-color: #f8f9fa; -} - -table tbody tr:last-of-type { - border-bottom: 2px solid #3498db; -} - -table tbody tr:hover { - background-color: #e8f4f8; -} - -/* Tabs */ -.nav-tabs { - border-bottom: 2px solid #ecf0f1; - margin-bottom: 20px; -} - -.nav-tabs > li > a { - color: #495057; - border: none; - border-radius: 8px 8px 0 0; - padding: 12px 20px; - margin-right: 5px; - background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); - font-weight: 500; - font-size: 14px; - transition: all 0.3s ease; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); - line-height: 1.4; -} - -.nav-tabs > li > a:hover { - background: linear-gradient(135deg, #e9ecef 0%, #dee2e6 100%); - color: #2c3e50; - box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15); -} - -.nav-tabs > li.active > a, -.nav-tabs > li.active > a:focus, -.nav-tabs > li.active > a:hover { - color: #ffffff; - background: linear-gradient(135deg, #3498db 0%, #2980b9 100%); - font-weight: bold; - border: none; - box-shadow: 0 4px 12px rgba(52, 152, 219, 0.3); -} - -.tab-content { - margin-top: 20px; -} - -.tab-pane { - display: none; -} - -.tab-pane.active, -.tab-pane.show.active { - display: block; -} - -.toggle-pane { - width: 100%; -} - -.hidden-pane { - display: none !important; -} - -/* Dropdown toolbar */ -.dropdown-toolbar { - display: block; - text-align: right; - margin: 0 0 10px 0; -} - -.dropdown-toolbar .btn { - min-width: 220px; - text-align: left; -} - -.dropdown-menu > li > a { - cursor: pointer; -} - -.dt-button { - background-color: #95a5a6 !important; - border: none !important; - color: #ffffff !important; - border-radius: 6px !important; - padding: 6px 14px !important; - font-size: 13px !important; - margin-right: 6px !important; -} - -.dt-button:hover, -.dt-button:active, -.dt-button:focus { - background-color: #7f8c8d !important; - color: #ffffff !important; - box-shadow: none !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button { - border: none !important; - background: transparent !important; - color: inherit !important; - margin: 0 !important; - padding: 0 !important; -} - -.dataTables_wrapper .dataTables_paginate ul.pagination { - display: inline-flex; - gap: 6px; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button a, -.dataTables_wrapper .dataTables_paginate .paginate_button span { - background: #95a5a6 !important; - color: #ffffff !important; - border-radius: 6px !important; - padding: 6px 14px !important; - border: none !important; - display: inline-block; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button.current, -.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover { - background: transparent !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button.current a, -.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover a { - background: #7f8c8d !important; - color: #ffffff !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button:hover { - background: transparent !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button:hover a { - background: #7f8c8d !important; - color: #ffffff !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button.disabled, -.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover { - background: transparent !important; - color: inherit !important; -} - -.dataTables_wrapper .dataTables_paginate .paginate_button.disabled a, -.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover a { - background: #bdc3c7 !important; - color: #ecf0f1 !important; - cursor: not-allowed !important; -} - -/* Title and metadata */ -.title, -h1.title { - font-size: 36px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #e74c3c; - text-align: center; - margin-bottom: 30px; -} - -.date, -p.date { - color: #7f8c8d; - font-size: 16px; - font-style: italic; - text-align: left; - margin-bottom: 32px; -} - -.section-spacer-xl { - height: clamp(40px, 6vw, 120px); - margin: 0; -} - -.main-container { - max-width: 1300px; - width: 100%; - margin: 0 auto; - padding-left: 24px; - padding-right: 24px; - box-sizing: border-box; -} - -/* Plot sizing */ -.section .html-widget, -.section .js-plotly-plot, -.toggle-pane .html-widget, -.toggle-pane .js-plotly-plot, -.plotly.html-widget, -.plotly.html-widget > div, -.js-plotly-plot .plot-container, -.js-plotly-plot .svg-container, -.html-widget-static-bound { - width: auto !important; - max-width: 100% !important; - margin-left: auto; - margin-right: auto; -} - -.plot-block-with-dropdown { - position: relative; - padding-top: 88px; - margin-bottom: 48px; -} - -.plot-block-with-dropdown > .dropdown-toolbar { - display: flex; - justify-content: flex-end; - gap: 12px; - position: absolute; - top: 8px; - right: 0; - margin: 0; -} - -.plot-block-with-dropdown .html-widget, -.plot-block-with-dropdown .js-plotly-plot, -.plot-block-with-dropdown .plotly.html-widget > div { - width: auto !important; - max-width: 100% !important; - margin-left: auto; - margin-right: auto; - margin-top: 18px; -} - -.plot-block-with-dropdown--structural { - margin-bottom: clamp(48px, calc(4vw + 1.5rem), 88px); -} - -#pca-analysis { - margin-top: 72px; - padding-top: 20px; -} - -/* Plotly tweaks */ -.html-widget, -.js-plotly-plot, -.plot-container, -.svg-container { - background: transparent !important; - border: 0 !important; - box-shadow: none !important; - outline: none !important; -} - -.js-plotly-plot .modebar, -.js-plotly-plot .modebar-group, -.js-plotly-plot .modebar-btn { - background: transparent !important; - box-shadow: none !important; -} - -.js-plotly-plot .modebar { - top: -18px !important; -} - -.js-plotly-plot .modebar-btn svg, -.js-plotly-plot .modebar-btn path { - fill: #6c757d !important; - color: #6c757d !important; -} - -.pca-pane { - margin-top: 36px; -} - -.pca-pane .plotly.html-widget, -.pca-pane .plotly.html-widget > div, -.pca-pane .js-plotly-plot { - margin-top: 22px; -} - -/* Utility colors */ -.sqanti-primary { color: #3498db; } -.sqanti-secondary { color: #e74c3c; } -.sqanti-success { color: #27ae60; } -.sqanti-warning { color: #f39c12; } - -@media (max-width: 992px) { - body { - font-size: 13px; - } - h1 { - font-size: 28px; - } - h2 { - font-size: 22px; - } - .main-container { - padding-left: 16px; - padding-right: 16px; - } - .plot-block-with-dropdown { - padding-top: 96px; - } -} - -@media (max-width: 768px) { - body { - font-size: 12px; - } - h1 { - font-size: 24px; - } - h2 { - font-size: 20px; - } - h3 { - font-size: 18px; - } - .nav-tabs > li > a { - padding: 10px 14px; - font-size: 13px; - } - table { - font-size: 12px; - } - table th, - table td { - padding: 8px 10px; - } - .dropdown-toolbar { - text-align: left; - } - .plot-block-with-dropdown { - padding-top: 0; - } - .plot-block-with-dropdown > .dropdown-toolbar { - position: static; - margin: 12px 0; - display: block; - } - .plot-block-with-dropdown--structural { - margin-bottom: 80px; - } -} - -@media print { - body { - font-size: 12px; - line-height: 1.4; - } - h1, h2, h3, h4 { - page-break-after: avoid; - } - table { - page-break-inside: avoid; - } - .nav-tabs, - .dropdown-toolbar, - .modebar { - display: none !important; - } - .tab-pane { - display: block !important; - } -} +body { + font-size: 14px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + line-height: 1.6; + color: #2c3e50; +} + +td { + font-size: 14px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; +} + +h1 { + font-size: 32px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #2c3e50; + padding-top: 20px; + border-bottom: 3px solid #3498db; + padding-bottom: 10px; + margin-top: 0; +} + +h2 { + font-size: 24px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #2c3e50; + padding-top: 30px; + border-bottom: 2px solid #ecf0f1; + padding-bottom: 8px; + margin-top: 0; +} + +h3 { + font-size: 20px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #34495e; + padding-top: 20px; + margin-top: 0; +} + +h4 { + font-size: 16px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #7f8c8d; + font-style: italic; + padding-top: 0; + margin-top: 0; +} + +/* TOC */ +#TOC { + background-color: #f8f9fa; + border: 1px solid #dee2e6; + border-radius: 8px; + padding: 20px; + margin-bottom: 30px; +} + +#TOC ul { + list-style-type: none; + padding-left: 0; +} + +#TOC ul ul { + padding-left: 20px; +} + +#TOC a { + color: #495057; + text-decoration: none; + display: block; + padding: 2px 0; +} + +#TOC a:hover { + color: #3498db; + text-decoration: underline; +} + +/* Tables */ +table { + border-collapse: collapse; + margin: 20px 0; + min-width: 300px; + font-size: 14px; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 0 20px rgba(0, 0, 0, 0.08); +} + +table thead tr { + background-color: #3498db; + color: #ffffff; + text-align: left; + font-weight: bold; +} + +table th, +table td { + padding: 12px 15px; + border: 1px solid #dddddd; +} + +table tbody tr { + border-bottom: 1px solid #dddddd; +} + +table tbody tr:nth-of-type(even) { + background-color: #f8f9fa; +} + +table tbody tr:last-of-type { + border-bottom: 2px solid #3498db; +} + +table tbody tr:hover { + background-color: #e8f4f8; +} + +/* Tabs */ +.nav-tabs { + border-bottom: 2px solid #ecf0f1; + margin-bottom: 20px; +} + +.nav-tabs > li > a { + color: #495057; + border: none; + border-radius: 8px 8px 0 0; + padding: 12px 20px; + margin-right: 5px; + background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); + font-weight: 500; + font-size: 14px; + transition: all 0.3s ease; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + line-height: 1.4; +} + +.nav-tabs > li > a:hover { + background: linear-gradient(135deg, #e9ecef 0%, #dee2e6 100%); + color: #2c3e50; + box-shadow: 0 4px 8px rgba(0, 0, 0, 0.15); +} + +.nav-tabs > li.active > a, +.nav-tabs > li.active > a:focus, +.nav-tabs > li.active > a:hover { + color: #ffffff; + background: linear-gradient(135deg, #3498db 0%, #2980b9 100%); + font-weight: bold; + border: none; + box-shadow: 0 4px 12px rgba(52, 152, 219, 0.3); +} + +.tab-content { + margin-top: 20px; +} + +.tab-pane { + display: none; +} + +.tab-pane.active, +.tab-pane.show.active { + display: block; +} + +.toggle-pane { + width: 100%; +} + +.hidden-pane { + display: none !important; +} + +/* Dropdown toolbar */ +.dropdown-toolbar { + display: block; + text-align: right; + margin: 0 0 10px 0; +} + +.dropdown-toolbar .btn { + min-width: 220px; + text-align: left; +} + +.dropdown-menu > li > a { + cursor: pointer; +} + +.dt-button { + background-color: #95a5a6 !important; + border: none !important; + color: #ffffff !important; + border-radius: 6px !important; + padding: 6px 14px !important; + font-size: 13px !important; + margin-right: 6px !important; +} + +.dt-button:hover, +.dt-button:active, +.dt-button:focus { + background-color: #7f8c8d !important; + color: #ffffff !important; + box-shadow: none !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button { + border: none !important; + background: transparent !important; + color: inherit !important; + margin: 0 !important; + padding: 0 !important; +} + +.dataTables_wrapper .dataTables_paginate ul.pagination { + display: inline-flex; + gap: 6px; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button a, +.dataTables_wrapper .dataTables_paginate .paginate_button span { + background: #95a5a6 !important; + color: #ffffff !important; + border-radius: 6px !important; + padding: 6px 14px !important; + border: none !important; + display: inline-block; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button.current, +.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover { + background: transparent !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button.current a, +.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover a { + background: #7f8c8d !important; + color: #ffffff !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button:hover { + background: transparent !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button:hover a { + background: #7f8c8d !important; + color: #ffffff !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button.disabled, +.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover { + background: transparent !important; + color: inherit !important; +} + +.dataTables_wrapper .dataTables_paginate .paginate_button.disabled a, +.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover a { + background: #bdc3c7 !important; + color: #ecf0f1 !important; + cursor: not-allowed !important; +} + +/* Title and metadata */ +.title, +h1.title { + font-size: 36px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #e74c3c; + text-align: center; + margin-bottom: 30px; +} + +.date, +p.date { + color: #7f8c8d; + font-size: 16px; + font-style: italic; + text-align: left; + margin-bottom: 32px; +} + +.section-spacer-xl { + height: clamp(40px, 6vw, 120px); + margin: 0; +} + +.main-container { + max-width: 1300px; + width: 100%; + margin: 0 auto; + padding-left: 24px; + padding-right: 24px; + box-sizing: border-box; +} + +/* Plot sizing */ +.section .html-widget, + +.plot-block-with-dropdown { + position: relative; + padding-top: 88px; + margin-bottom: 48px; +} + +.plot-block-with-dropdown > .dropdown-toolbar { + display: flex; + justify-content: flex-end; + gap: 12px; + position: absolute; + top: 8px; + right: 0; + margin: 0; +} + +.plot-block-with-dropdown .html-widget, + +.plot-block-with-dropdown--structural { + margin-bottom: clamp(48px, calc(4vw + 1.5rem), 88px); +} + +#pca-analysis { + margin-top: 72px; + padding-top: 20px; +} + +.html-widget, + +.pca-pane { + margin-top: 36px; +} + +.sqanti-secondary { color: #e74c3c; } +.sqanti-success { color: #27ae60; } +.sqanti-warning { color: #f39c12; } + +@media (max-width: 992px) { + body { + font-size: 13px; + } + h1 { + font-size: 28px; + } + h2 { + font-size: 22px; + } + .main-container { + padding-left: 16px; + padding-right: 16px; + } + .plot-block-with-dropdown { + padding-top: 96px; + } +} + +@media (max-width: 768px) { + body { + font-size: 12px; + } + h1 { + font-size: 24px; + } + h2 { + font-size: 20px; + } + h3 { + font-size: 18px; + } + .nav-tabs > li > a { + padding: 10px 14px; + font-size: 13px; + } + table { + font-size: 12px; + } + table th, + table td { + padding: 8px 10px; + } + .dropdown-toolbar { + text-align: left; + } + .plot-block-with-dropdown { + padding-top: 0; + } + .plot-block-with-dropdown > .dropdown-toolbar { + position: static; + margin: 12px 0; + display: block; + } + .plot-block-with-dropdown--structural { + margin-bottom: 80px; + } +} + +@media print { + body { + font-size: 12px; + line-height: 1.4; + } + h1, h2, h3, h4 { + page-break-after: avoid; + } + table { + page-break-inside: avoid; + } + .nav-tabs, + .dropdown-toolbar, + .modebar { + display: none !important; + } + .tab-pane { + display: block !important; + } +} diff --git a/src/report_assets/style.css b/src/report_assets/style.css index 1012b3a..26849a4 100644 --- a/src/report_assets/style.css +++ b/src/report_assets/style.css @@ -1,430 +1,456 @@ -body { - font-size: 14px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - line-height: 1.6; -} - -td { - font-size: 14px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; -} - -h1 { - font-size: 32px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #2c3e50; - padding-top: 20px; - border-bottom: 3px solid #3498db; - padding-bottom: 10px; -} - -h2 { - font-size: 24px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #2c3e50; - padding-top: 30px; - border-bottom: 2px solid #ecf0f1; - padding-bottom: 8px; -} - -h3 { - font-size: 20px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #34495e; - padding-top: 20px; -} - -h4 { - font-size: 16px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #7f8c8d; - font-style: italic; -} - -/* Right-aligned dropdown toolbar used above tables */ -.dropdown-toolbar { - display: block; - text-align: right; - margin: 0 0 10px 0; -} - -.dropdown-toolbar .btn { - min-width: 220px; - text-align: left; -} - -.dropdown-menu > li > a { - cursor: pointer; -} - -/* Constrain width of single-plot tabs in Sequencing Depth to improve readability */ -.section.level1:has(> h1:contains("Sequencing Depth")) .tab-content .tab-pane > div.html-widget { - max-width: 900px; - margin-left: auto; - margin-right: auto; -} - -.title { - font-size: 36px; - font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; - color: #e74c3c; - text-align: center; - margin-bottom: 30px; -} - -/* Table styling */ -table { - border-collapse: collapse; - margin: 20px 0; - font-size: 14px; - min-width: 300px; - border-radius: 8px; - overflow: hidden; - box-shadow: 0 0 20px rgba(0, 0, 0, 0.1); -} - -table thead tr { - background-color: #3498db; - color: #ffffff; - text-align: left; - font-weight: bold; -} - -table th, -table td { - padding: 12px 15px; - border: 1px solid #dddddd; -} - -table tbody tr { - border-bottom: 1px solid #dddddd; -} - -table tbody tr:nth-of-type(even) { - background-color: #f8f9fa; -} - -table tbody tr:last-of-type { - border-bottom: 2px solid #3498db; -} - -table tbody tr:hover { - background-color: #e8f4f8; -} - -/* Tab styling */ -.nav-tabs { - border-bottom: 2px solid #ecf0f1; - margin-bottom: 20px; -} - -.nav-tabs > li > a { - color: #495057; - border: none; - border-radius: 8px 8px 0 0; - padding: 12px 20px; - margin-right: 5px; - background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); - font-weight: 500; - font-size: 14px; - transition: all 0.3s ease; - box-shadow: 0 2px 4px rgba(0,0,0,0.1); - line-height: 1.4; -} - -.nav-tabs > li > a:hover { - background: linear-gradient(135deg, #e9ecef 0%, #dee2e6 100%); - color: #2c3e50; -} - -.nav-tabs > li.active > a, -.nav-tabs > li.active > a:hover, -.nav-tabs > li.active > a:focus { - color: white; - background: linear-gradient(135deg, #3498db 0%, #2980b9 100%); - font-weight: bold; - border: none; - box-shadow: 0 4px 12px rgba(52, 152, 219, 0.3); -} - -/* Pill styling */ -.nav-pills { - margin-bottom: 20px; - display: flex; - justify-content: flex-end; -} - -.nav-pills > li > a { - color: #34495e; - border: 1px solid #3498db; - border-radius: 20px; - padding: 8px 16px; - margin-right: 5px; - background-color: #ffffff; - transition: all 0.3s ease; -} - -.nav-pills > li > a:hover { - background-color: #3498db; - color: #ffffff; - border-color: #3498db; -} - -.nav-pills > li.active > a, -.nav-pills > li.active > a:hover, -.nav-pills > li.active > a:focus { - color: #ffffff; - background-color: #3498db; - border-color: #3498db; -} - -.nav-pills > li { - margin-left: 5px; -} - -.nav-pills > li:first-child { - margin-left: 0; -} - -/* Tab content */ -.tab-content { - margin-top: 20px; -} - -.tab-pane { - display: none; -} - -.tab-pane.active, -.tab-pane.show.active { - display: block; -} - -/* Hide toggle panes without collapsing width to preserve DT sizing */ -.hidden-pane { - display: none !important; -} - -/* Generic wrapper to narrow wide plotly widgets (applied only where used) */ -.narrow-plot .html-widget { - width: 600px !important; - max-width: 100% !important; - margin-left: auto !important; - margin-right: auto !important; - box-shadow: none !important; - border: none !important; -} - -/* Remove borders/shadows/backgrounds around ALL Plotly widgets */ -.html-widget, -.js-plotly-plot, -.plot-container, -.svg-container { - background: transparent !important; - border: 0 !important; - box-shadow: none !important; - outline: none !important; -} - -/* Fix Plotly modebar background (Plotly >=5 default is semi-transparent grey) */ -.js-plotly-plot .modebar, -.js-plotly-plot .modebar-group, -.js-plotly-plot .modebar-btn { - background: transparent !important; - box-shadow: none !important; -} - -/* Optional: neutral grey icons for readability */ -.js-plotly-plot .modebar-btn svg, -.js-plotly-plot .modebar-btn path { - fill: #6c757d !important; - color: #6c757d !important; -} - -/* Ensure Plotly x-axis tick labels match the rest of the plots' x-axis text size */ -.js-plotly-plot .xaxislayer-above text { - font-size: 16px !important; -} - -/* Pills positioning for right alignment */ -.nav-pills { - display: flex; - justify-content: flex-end; - margin-bottom: 20px; -} - -.nav-pills > li { - margin-left: 5px; -} - -.nav-pills > li:first-child { - margin-left: 0; -} - -/* Pills container for right-aligned pills */ -.pills-container { - margin-top: 20px; -} - -.pills-header { - display: flex; - justify-content: space-between; - align-items: center; - margin-bottom: 20px; - padding-bottom: 10px; - border-bottom: 1px solid #ecf0f1; -} - -.pills-title { - font-size: 18px; - font-weight: bold; - color: #2c3e50; - margin: 0; -} - -.pills-nav { - flex-shrink: 0; -} - -.pills-nav .nav-pills { - margin-bottom: 0; -} - -.pills-nav .nav-pills > li > a { - font-size: 14px; - padding: 6px 12px; - margin-right: 3px; -} - -/* Code blocks */ -pre { - background-color: #f8f9fa; - border: 1px solid #e9ecef; - border-radius: 4px; - padding: 15px; - font-size: 13px; -} - -code { - background-color: #f8f9fa; - color: #e74c3c; - padding: 2px 4px; - border-radius: 3px; - font-size: 90%; -} - -/* Plot containers */ -.plot-container { - margin: 20px 0; - padding: 15px; - background-color: #ffffff; - border: 1px solid #ecf0f1; - border-radius: 8px; - box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); -} - -/* TOC styling */ -#TOC { - background-color: #f8f9fa; - border: 1px solid #dee2e6; - border-radius: 8px; - padding: 20px; - margin-bottom: 30px; -} - -#TOC ul { - list-style-type: none; - padding-left: 0; -} - -#TOC > ul { - padding-left: 0; -} - -#TOC ul ul { - padding-left: 20px; -} - -#TOC a { - color: #495057; - text-decoration: none; - display: block; - padding: 2px 0; -} - -#TOC a:hover { - color: #3498db; - text-decoration: underline; -} - -/* Responsive design */ -@media (max-width: 768px) { - body { font-size: 12px; } - h1 { font-size: 24px; } - h2 { font-size: 20px; } - h3 { font-size: 18px; } - table { font-size: 12px; } - table th, - table td { padding: 8px 10px; } -} - -/* Print styles */ -@media print { - body { font-size: 12px; line-height: 1.4; } - h1, h2, h3, h4 { page-break-after: avoid; } - table { page-break-inside: avoid; } - .nav-tabs { display: none; } -} - -/* Custom colors for consistency with SQANTI theme */ -.sqanti-primary { color: #3498db; } -.sqanti-secondary { color: #e74c3c; } -.sqanti-success { color: #27ae60; } -.sqanti-warning { color: #f39c12; } - -/* Alert boxes */ -.alert { - padding: 15px; - margin-bottom: 20px; - border: 1px solid transparent; - border-radius: 4px; -} - -.alert-info { - color: #31708f; - background-color: #d9edf7; - border-color: #bce8f1; -} - -.alert-warning { - color: #8a6d3b; - background-color: #fcf8e3; - border-color: #faebcc; -} - -.alert-success { - color: #3c763d; - background-color: #dff0d8; - border-color: #d6e9c6; -} - -.alert-danger { - color: #a94442; - background-color: #f2dede; - border-color: #ebccd1; -} - -/* Reduce extra space before tabsets created by R Markdown {.tabset} sections */ -.section.tabset > h2 { - /* Tighten heading that precedes tabs */ - padding-top: 8px; - padding-bottom: 0; - margin-top: 0; - margin-bottom: 6px; - border-bottom: none; -} - -.section.tabset > .nav-tabs { - /* Minimize gap between heading and tabs */ - margin-top: 0; -} +body { + font-size: 14px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + line-height: 1.6; +} + +td { + font-size: 14px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; +} + +h1 { + font-size: 32px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #2c3e50; + padding-top: 20px; + border-bottom: 3px solid #3498db; + padding-bottom: 10px; +} + +h2 { + font-size: 24px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #2c3e50; + padding-top: 30px; + border-bottom: 2px solid #ecf0f1; + padding-bottom: 8px; +} + +h3 { + font-size: 20px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #34495e; + padding-top: 20px; +} + +h4 { + font-size: 16px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #7f8c8d; + font-style: italic; +} + +/* Right-aligned dropdown toolbar used above tables */ +.dropdown-toolbar { + display: block; + text-align: right; + margin: 0 0 10px 0; +} + +.dropdown-toolbar .btn { + min-width: 220px; + text-align: left; +} + +.dropdown-menu>li>a { + cursor: pointer; +} + +/* Constrain width of single-plot tabs in Sequencing Depth to improve readability */ +.section.level1:has(> h1:contains("Sequencing Depth")) .tab-content .tab-pane>div.html-widget { + max-width: 900px; + margin-left: auto; + margin-right: auto; +} + +.title { + font-size: 36px; + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + color: #e74c3c; + text-align: center; + margin-bottom: 30px; +} + +/* Table styling */ +table { + border-collapse: collapse; + margin: 20px 0; + font-size: 14px; + min-width: 300px; + border-radius: 8px; + overflow: hidden; + box-shadow: 0 0 20px rgba(0, 0, 0, 0.1); +} + +table thead tr { + background-color: #3498db; + color: #ffffff; + text-align: left; + font-weight: bold; +} + +table th, +table td { + padding: 12px 15px; + border: 1px solid #dddddd; +} + +table tbody tr { + border-bottom: 1px solid #dddddd; +} + +table tbody tr:nth-of-type(even) { + background-color: #f8f9fa; +} + +table tbody tr:last-of-type { + border-bottom: 2px solid #3498db; +} + +table tbody tr:hover { + background-color: #e8f4f8; +} + +/* Tab styling */ +.nav-tabs { + border-bottom: 2px solid #ecf0f1; + margin-bottom: 20px; +} + +.nav-tabs>li>a { + color: #495057; + border: none; + border-radius: 8px 8px 0 0; + padding: 12px 20px; + margin-right: 5px; + background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); + font-weight: 500; + font-size: 14px; + transition: all 0.3s ease; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + line-height: 1.4; +} + +.nav-tabs>li>a:hover { + background: linear-gradient(135deg, #e9ecef 0%, #dee2e6 100%); + color: #2c3e50; +} + +.nav-tabs>li.active>a, +.nav-tabs>li.active>a:hover, +.nav-tabs>li.active>a:focus { + color: white; + background: linear-gradient(135deg, #3498db 0%, #2980b9 100%); + font-weight: bold; + border: none; + box-shadow: 0 4px 12px rgba(52, 152, 219, 0.3); +} + +/* Pill styling */ +.nav-pills { + margin-bottom: 20px; + display: flex; + justify-content: flex-end; +} + +.nav-pills>li>a { + color: #34495e; + border: 1px solid #3498db; + border-radius: 20px; + padding: 8px 16px; + margin-right: 5px; + background-color: #ffffff; + transition: all 0.3s ease; +} + +.nav-pills>li>a:hover { + background-color: #3498db; + color: #ffffff; + border-color: #3498db; +} + +.nav-pills>li.active>a, +.nav-pills>li.active>a:hover, +.nav-pills>li.active>a:focus { + color: #ffffff; + background-color: #3498db; + border-color: #3498db; +} + +.nav-pills>li { + margin-left: 5px; +} + +.nav-pills>li:first-child { + margin-left: 0; +} + +/* Tab content */ +.tab-content { + margin-top: 20px; +} + +.tab-pane { + display: none; +} + +.tab-pane.active, +.tab-pane.show.active { + display: block; +} + +/* Hide toggle panes without collapsing width to preserve DT sizing */ +.hidden-pane { + display: none !important; +} + +.narrow-plot .html-widget { + width: 600px !important; + max-width: 100% !important; + margin-left: auto !important; + margin-right: auto !important; + box-shadow: none !important; + border: none !important; +} + +.html-widget, + +/* Optional: neutral grey icons for readability */ + +/* Pills positioning for right alignment */ +.nav-pills { + display: flex; + justify-content: flex-end; + margin-bottom: 20px; +} + +.nav-pills>li { + margin-left: 5px; +} + +.nav-pills>li:first-child { + margin-left: 0; +} + +/* Pills container for right-aligned pills */ +.pills-container { + margin-top: 20px; +} + +.pills-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 20px; + padding-bottom: 10px; + border-bottom: 1px solid #ecf0f1; +} + +.pills-title { + font-size: 18px; + font-weight: bold; + color: #2c3e50; + margin: 0; +} + +.pills-nav { + flex-shrink: 0; +} + +.pills-nav .nav-pills { + margin-bottom: 0; +} + +.pills-nav .nav-pills>li>a { + font-size: 14px; + padding: 6px 12px; + margin-right: 3px; +} + +/* Code blocks */ +pre { + background-color: #f8f9fa; + border: 1px solid #e9ecef; + border-radius: 4px; + padding: 15px; + font-size: 13px; +} + +code { + background-color: #f8f9fa; + color: #e74c3c; + padding: 2px 4px; + border-radius: 3px; + font-size: 90%; +} + +/* Plot containers */ +.plot-container { + margin: 20px 0; + padding: 15px; + background-color: #ffffff; + border: 1px solid #ecf0f1; + border-radius: 8px; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); +} + +/* TOC styling */ +#TOC { + background-color: #f8f9fa; + border: 1px solid #dee2e6; + border-radius: 8px; + padding: 20px; + margin-bottom: 30px; +} + +#TOC ul { + list-style-type: none; + padding-left: 0; +} + +#TOC>ul { + padding-left: 0; +} + +#TOC ul ul { + padding-left: 20px; +} + +#TOC a { + color: #495057; + text-decoration: none; + display: block; + padding: 2px 0; +} + +#TOC a:hover { + color: #3498db; + text-decoration: underline; +} + +/* Responsive design */ +@media (max-width: 768px) { + body { + font-size: 12px; + } + + h1 { + font-size: 24px; + } + + h2 { + font-size: 20px; + } + + h3 { + font-size: 18px; + } + + table { + font-size: 12px; + } + + table th, + table td { + padding: 8px 10px; + } +} + +/* Print styles */ +@media print { + body { + font-size: 12px; + line-height: 1.4; + } + + h1, + h2, + h3, + h4 { + page-break-after: avoid; + } + + table { + page-break-inside: avoid; + } + + .nav-tabs { + display: none; + } +} + +/* Custom colors for consistency with SQANTI theme */ +.sqanti-primary { + color: #3498db; +} + +.sqanti-secondary { + color: #e74c3c; +} + +.sqanti-success { + color: #27ae60; +} + +.sqanti-warning { + color: #f39c12; +} + +/* Alert boxes */ +.alert { + padding: 15px; + margin-bottom: 20px; + border: 1px solid transparent; + border-radius: 4px; +} + +.alert-info { + color: #31708f; + background-color: #d9edf7; + border-color: #bce8f1; +} + +.alert-warning { + color: #8a6d3b; + background-color: #fcf8e3; + border-color: #faebcc; +} + +.alert-success { + color: #3c763d; + background-color: #dff0d8; + border-color: #d6e9c6; +} + +.alert-danger { + color: #a94442; + background-color: #f2dede; + border-color: #ebccd1; +} + +/* Reduce extra space before tabsets created by R Markdown {.tabset} sections */ +.section.tabset>h2 { + /* Tighten heading that precedes tabs */ + padding-top: 8px; + padding-bottom: 0; + margin-top: 0; + margin-bottom: 6px; + border-bottom: none; +} + +.section.tabset>.nav-tabs { + /* Minimize gap between heading and tabs */ + margin-top: 0; +} + +/* Force DataTables to left-align when autoWidth=FALSE prevents them from filling the container */ +.dataTables_wrapper { + width: 100% !important; +} + +table.dataTable { + margin-left: 0 !important; + margin-right: auto !important; + width: 100% !important; +} \ No newline at end of file diff --git a/tests/sqanti_sc_test.py b/tests/sqanti_sc_test.py index 883afe5..b4665ed 100644 --- a/tests/sqanti_sc_test.py +++ b/tests/sqanti_sc_test.py @@ -502,8 +502,9 @@ def _isfile(path): expected_cmd = ( f"Rscript utilities/SQANTI-sc_report.R " - f"{class_file} {junc_file} {mock_args.report} {prefix} " - f"{mock_args.mode} --cell_summary {cell_summary}" + f"\"{class_file}\" \"{junc_file}\" {mock_args.report} \"{prefix}\" " + f"{mock_args.mode} --cell_summary {cell_summary} " + f"--refGTF \"{mock_args.refGTF}\"" ).strip() actual_cmd = " ".join(mock_run.call_args[0][0].split()) @@ -535,8 +536,8 @@ def _isfile_side_effect(path): assert "--cell_summary" not in actual_cmd expected_cmd = ( f"Rscript utilities/SQANTI-sc_report.R " - f"{class_file} {junc_file} {mock_args.report} {prefix} " - f"{mock_args.mode}" + f"\"{class_file}\" \"{junc_file}\" {mock_args.report} \"{prefix}\" " + f"{mock_args.mode} --refGTF \"{mock_args.refGTF}\"" ).strip() assert actual_cmd == expected_cmd @@ -875,3 +876,301 @@ def parse_args(self_inner): assert mock_clustering.called assert mock_generate_report.called + +# ============================================================================== +# FL Weighting Tests (isoforms mode) +# ============================================================================== + +class TestFLWeightingIsoformsMode: + """ + Tests verifying that in isoforms mode, transcript counts and junction counts + are correctly weighted by the FL (full-length read count) values per cell. + + Core principle + -------------- + Each row in the classification file is a unique isoform model. The FL + column contains comma-separated counts representing how many actual + transcripts of that isoform were observed in each cell barcode listed in + the CB column. All per-cell metrics (structural-category percentages, + junction counts, etc.) must be computed by *summing these FL counts*, NOT + by simply counting isoform model rows. + + For junctions: each junction of a given isoform inherits the same FL counts + as the isoform itself (one junction row per intron per isoform model, so + 3 exons → 2 junction rows, each with FL counts equal to the isoform's FL). + """ + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _cls_row(self, isoform, cb, fl, structural_category, + associated_gene="geneA", exons=2): + """Return one classification row dict (isoforms mode).""" + return { + "isoform": isoform, + "CB": cb, + "FL": fl, + "structural_category": structural_category, + "associated_gene": associated_gene, + "associated_transcript": "txA", + "exons": exons, + "length": 500, + "ref_length": 600, + "chrom": "chr1", + "subcategory": "reference_match", + "all_canonical": "True", + "RTS_stage": "False", + "predicted_NMD": "False", + "within_CAGE_peak": "False", + "polyA_motif_found": "False", + "perc_A_downstream_TTS": "0", + "diff_to_gene_TSS": "0", + "coding": "coding", + "min_cov": "0", + "ratio_TSS": "0", + } + + def _junc_row(self, isoform, junction_category="known", canonical="canonical", + rts=False): + """Return one junction row dict (isoform key only; CB/FL come from cls join).""" + return { + "isoform": isoform, + "junction_category": junction_category, + "canonical": canonical, + "RTS_junction": str(rts), + "junction_number": "1", + "chrom": "chr1", + "strand": "+", + "genomic_start_coord": "1000", + "genomic_end_coord": "2000", + } + + def _run(self, mock_args, tmpdir, cls_rows, junc_rows=None): + """Write files, run calculate_metrics_per_cell, return cell summary DataFrame.""" + out_dir = str(tmpdir.join("output")) + file_acc = "f1" + sampleID = "s1" + sample_dir = os.path.join(out_dir, file_acc) + os.makedirs(sample_dir, exist_ok=True) + prefix = os.path.join(sample_dir, sampleID) + + mock_args.mode = "isoforms" + mock_args.out_dir = out_dir + + pd.DataFrame(cls_rows).to_csv( + f"{prefix}_classification.txt", sep="\t", index=False + ) + + if junc_rows: + pd.DataFrame(junc_rows).to_csv( + f"{prefix}_junctions.txt", sep="\t", index=False + ) + else: + # Empty but valid junctions file + pd.DataFrame(columns=["isoform"]).to_csv( + f"{prefix}_junctions.txt", sep="\t", index=False + ) + + design_df = pd.DataFrame({"sampleID": [sampleID], "file_acc": [file_acc]}) + calculate_metrics_per_cell(mock_args, design_df) + + summary_path = f"{prefix}_SQANTI_cell_summary.txt.gz" + assert os.path.isfile(summary_path), "Cell summary was not created" + return pd.read_csv(summary_path, sep="\t", compression="gzip") + + # ------------------------------------------------------------------ + # Test 1 — total_reads reflects sum(FL), not number of isoform rows + # ------------------------------------------------------------------ + + def test_total_reads_is_fl_sum_not_row_count(self, mock_args, tmpdir): + """ + A cell with 3 FSM isoforms each having FL=5 should have total_reads=15, + not total_reads=3 (the number of isoform model rows). + """ + cls_rows = [ + self._cls_row("iso1", "CB1", "5", "full-splice_match"), + self._cls_row("iso2", "CB1", "5", "full-splice_match"), + self._cls_row("iso3", "CB1", "5", "full-splice_match"), + ] + summary = self._run(mock_args, tmpdir, cls_rows) + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + assert cb1["Transcripts_in_cell"] == 15, ( + f"Expected Transcripts_in_cell=15 (sum of FL counts), got {cb1['Transcripts_in_cell']}" + ) + + # ------------------------------------------------------------------ + # Test 2 — structural category % is FL-weighted + # ------------------------------------------------------------------ + + def test_structural_category_percentage_fl_weighted(self, mock_args, tmpdir): + """ + CB1 has: + iso1 (FSM, FL=1) → 1 FSM transcript + iso2 (NIC, FL=9) → 9 NIC transcripts + + FL-weighted result : FSM_prop = 10 %, NIC_prop = 90 % + Naive row-count : FSM_prop = 50 %, NIC_prop = 50 % (wrong) + """ + cls_rows = [ + self._cls_row("iso1", "CB1", "1", "full-splice_match"), + self._cls_row("iso2", "CB1", "9", "novel_in_catalog"), + ] + summary = self._run(mock_args, tmpdir, cls_rows) + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + + assert abs(cb1["FSM_prop"] - 10.0) < 0.01, ( + f"Expected FSM_prop≈10 % (FL-weighted), got {cb1['FSM_prop']}" + ) + assert abs(cb1["NIC_prop"] - 90.0) < 0.01, ( + f"Expected NIC_prop≈90 % (FL-weighted), got {cb1['NIC_prop']}" + ) + + # ------------------------------------------------------------------ + # Test 3 — junction counts are FL-weighted via isoform-key join + # ------------------------------------------------------------------ + + def test_junction_counts_fl_weighted_via_isoform_join(self, mock_args, tmpdir): + """ + iso1 (FL=1 in CB1): 1 known_canonical junction + iso2 (FL=9 in CB1): 1 novel_not_in_catalog junction + + Each junction inherits the isoform's FL count for that cell, so: + Known_canonical_junctions = 1 (1 × FL=1) + Novel_non_canonical_junctions = 9 (1 × FL=9) ← novel_not_in_catalog + canonical=False → novel_non_canonical + Known_canonical_junctions_prop = 1/(1+9) × 100 = 10 % + + In the junctions file there is no CB column — the CB/FL association is + derived from the classification file via the isoform key, exactly as + SQANTI-sc does it after the fix we applied. + """ + cls_rows = [ + self._cls_row("iso1", "CB1", "1", "full-splice_match", exons=2), + self._cls_row("iso2", "CB1", "9", "novel_not_in_catalog", exons=2), + ] + junc_rows = [ + self._junc_row("iso1", junction_category="known", canonical="canonical"), + self._junc_row("iso2", junction_category="novel", canonical="non_canonical"), + ] + summary = self._run(mock_args, tmpdir, cls_rows, junc_rows) + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + + assert cb1["Known_canonical_junctions"] == 1, ( + f"Expected 1 known_canonical junction (FL=1), got {cb1['Known_canonical_junctions']}" + ) + assert cb1["Novel_non_canonical_junctions"] == 9, ( + f"Expected 9 novel_non_canonical junctions (FL=9), got {cb1['Novel_non_canonical_junctions']}" + ) + assert abs(cb1["Known_canonical_junctions_prop"] - 10.0) < 0.01, ( + f"Expected Known_canonical_junctions_prop≈10 %, got {cb1['Known_canonical_junctions_prop']}" + ) + + # ------------------------------------------------------------------ + # Test 4 — multiple junctions per isoform each inherit FL + # ------------------------------------------------------------------ + + def test_multi_junction_isoform_each_junction_gets_fl_count(self, mock_args, tmpdir): + """ + iso1 has 3 exons → 2 junctions, FL=5 in CB1. + Both junctions are known_canonical, so: + Known_canonical_junctions = 2 × 5 = 10 + """ + cls_rows = [ + self._cls_row("iso1", "CB1", "5", "full-splice_match", exons=3), + ] + junc_rows = [ + self._junc_row("iso1", junction_category="known", canonical="canonical"), + self._junc_row("iso1", junction_category="known", canonical="canonical"), + ] + summary = self._run(mock_args, tmpdir, cls_rows, junc_rows) + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + + assert cb1["Known_canonical_junctions"] == 10, ( + f"Expected Known_canonical_junctions=10 (2 junctions × FL=5), " + f"got {cb1['Known_canonical_junctions']}" + ) + + # ------------------------------------------------------------------ + # Test 5 — same isoform appearing in multiple cells is weighted + # independently per cell + # ------------------------------------------------------------------ + + def test_fl_weighting_is_per_cell_independent(self, mock_args, tmpdir): + """ + iso1 appears in CB1 (FL=3) and CB2 (FL=7) via comma-separated CB/FL. + iso2 appears only in CB2 (FL=2). + + All isoforms are FSM; expected results: + CB1: total_reads=3, FSM_prop=100 % + CB2: total_reads=9, FSM_prop=100 % (iso1 FL=7 + iso2 FL=2) + """ + cls_rows = [ + self._cls_row("iso1", "CB1,CB2", "3,7", "full-splice_match"), + self._cls_row("iso2", "CB2", "2", "full-splice_match"), + ] + summary = self._run(mock_args, tmpdir, cls_rows) + + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + cb2 = summary[summary["CB"] == "CB2"].iloc[0] + + assert cb1["Transcripts_in_cell"] == 3, ( + f"CB1 Transcripts_in_cell should be 3 (FL for CB1), got {cb1['Transcripts_in_cell']}" + ) + assert cb2["Transcripts_in_cell"] == 9, ( + f"CB2 Transcripts_in_cell should be 9 (FL=7 + FL=2), got {cb2['Transcripts_in_cell']}" + ) + assert abs(cb1["FSM_prop"] - 100.0) < 0.01 + assert abs(cb2["FSM_prop"] - 100.0) < 0.01 + + # ------------------------------------------------------------------ + # Test 6 — reads mode uses 1 count per row regardless of FL column + # ------------------------------------------------------------------ + + def test_reads_mode_uses_one_count_per_row(self, mock_args, tmpdir): + """ + In reads mode each classification row is a single read (CB is a single + barcode, FL column is absent or irrelevant). total_reads must equal + the number of rows, not any FL value. + """ + out_dir = str(tmpdir.join("output_reads")) + file_acc = "f1" + sampleID = "s1" + sample_dir = os.path.join(out_dir, file_acc) + os.makedirs(sample_dir, exist_ok=True) + prefix = os.path.join(sample_dir, sampleID) + + mock_args.mode = "reads" + mock_args.out_dir = out_dir + + # 3 reads all from CB1 — no FL column + cls = pd.DataFrame([ + {"isoform": "r1", "CB": "CB1", "structural_category": "full-splice_match", + "associated_gene": "geneA", "associated_transcript": "txA", + "exons": 2, "length": 500, "ref_length": 600, "chrom": "chr1"}, + {"isoform": "r2", "CB": "CB1", "structural_category": "full-splice_match", + "associated_gene": "geneA", "associated_transcript": "txA", + "exons": 2, "length": 500, "ref_length": 600, "chrom": "chr1"}, + {"isoform": "r3", "CB": "CB1", "structural_category": "novel_in_catalog", + "associated_gene": "geneA", "associated_transcript": "txA", + "exons": 2, "length": 500, "ref_length": 600, "chrom": "chr1"}, + ]) + cls.to_csv(f"{prefix}_classification.txt", sep="\t", index=False) + pd.DataFrame(columns=["isoform"]).to_csv( + f"{prefix}_junctions.txt", sep="\t", index=False + ) + + design_df = pd.DataFrame({"sampleID": [sampleID], "file_acc": [file_acc]}) + calculate_metrics_per_cell(mock_args, design_df) + + summary = pd.read_csv( + f"{prefix}_SQANTI_cell_summary.txt.gz", sep="\t", compression="gzip" + ) + cb1 = summary[summary["CB"] == "CB1"].iloc[0] + + # 3 rows → 3 reads (count=1 per row, no FL weighting) + assert cb1["Reads_in_cell"] == 3, ( + f"Reads mode: expected Reads_in_cell=3 (one per row), got {cb1['Reads_in_cell']}" + ) + assert abs(cb1["FSM_prop"] - (2 / 3 * 100)) < 0.1, ( + f"Reads mode: expected FSM_prop≈66.7 %, got {cb1['FSM_prop']}" + )