diff --git a/R/flatten.R b/R/flatten.R new file mode 100644 index 0000000..8b640ee --- /dev/null +++ b/R/flatten.R @@ -0,0 +1,34 @@ +#' Bind rows in a list of list of data frames +#' +#' This is a utility function that converts a list of lists, each containing +#' a names list of data frames into a single flat list of data frames. +#' @param x list; a list which contains a named list of data frames +#' @return a flat list of data frames where lists of data frames with the same +#' name are merged. +flatten <- function(x) { + is_named_list <- function(x) { + !is.null(names(x)) && all(names(x) != "") + } + #if (!is_named_list(x)) { + # stop("'x' must be a named list.") + #} + if (any(!sapply(x, is_named_list))) { + stop("Each element of 'x' must be a named list.") + } + df_names <- lapply(x, names) |> unlist() |> unique() + out <- setNames(lapply(df_names, function(x) tibble::tibble()), df_names) + for (i in seq_along(out)) { + df_name <- names(out)[i] + for (j in seq_along(x)) { + if (df_name %in% names(x[[j]])) { + out[[i]] <- dplyr::bind_rows( + out[[i]], + x[[j]][[names(out)[i]]] + ) + } else { + next() + } + } + } + return(out) +} \ No newline at end of file diff --git a/R/ncbi_get_meta.R b/R/ncbi_get_meta.R index 37e6f25..1bd716f 100644 --- a/R/ncbi_get_meta.R +++ b/R/ncbi_get_meta.R @@ -163,14 +163,7 @@ ncbi_get_meta <- function( if (verbose) { message("Attempting to parse retrieved metadata.") } - res_parsed <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose) - if (all("data.frame" %in% class(res_parsed))) { - out <- dplyr::bind_rows(res_parsed) - if (verbose) message("Done.") - } else { - if (verbose) message("Returning unparsed metadata.") - out <- res - } + out <- ncbi_parse(meta = res, mc_cores = mc_cores, verbose = verbose) } else { out <- res } diff --git a/R/ncbi_parse_biosample_xml.R b/R/ncbi_parse_biosample_xml.R index c4e6b4e..a6ed16b 100644 --- a/R/ncbi_parse_biosample_xml.R +++ b/R/ncbi_parse_biosample_xml.R @@ -19,6 +19,7 @@ ncbi_parse_biosample_xml <- function( mc_cores <- get_mc_cores(mc_cores, verbose = verbose) biosample_df <- data.frame() if (verbose) message("Attempting to parse BioSample XMLs.") + result <- list() for (i in seq_along(biosample_xml)) { if (verbose) message( "BioSample XML ", i, " of ", length(biosample_xml), @@ -37,7 +38,7 @@ ncbi_parse_biosample_xml <- function( attributes(x)$accession }) if (verbose) message( - "List to data frame.. ", appendLF = FALSE + "List to a list of data frames.. ", appendLF = FALSE ) pfoo <- function(x) { entry <- try( @@ -85,16 +86,14 @@ ncbi_parse_biosample_xml <- function( out <- out[-index_failed] } if (verbose) message("Successful.") - out <- dplyr::bind_rows(out) - out <- dplyr::relocate(out, "biosample_uid") - out <- dplyr::relocate(out, "biosample", .after = "biosample_uid") - biosample_df <- dplyr::bind_rows(biosample_df, out) + result[[i]] <- webseq:::flatten(out) } - out <- tibble::as_tibble(biosample_df) - out <- out[, order(unname(sapply(out, function(x) sum(is.na(x)))))] - out <- dplyr::relocate(out, "biosample_uid") - out <- dplyr::relocate(out, "biosample", .after = "biosample_uid") - return(out) + result <- webseq:::flatten(result) + result$main <- result$main[, order(unname(sapply(result$main, function(x) sum(is.na(x)))))] + result$main <- result$main |> + dplyr::relocate(biosample_uid) |> + dplyr::relocate(biosample, .after = biosample_uid) + return(result) } @@ -165,6 +164,25 @@ ncbi_parse_biosample_xml_entry <- function(x, verbose = getOption("verbose")) { )) } } + # if description contains a table, extract it + if (grepl("^WEBSEQ", out$description)) { + description_table <- extract_description_table(x, main_attrs$accession, verbose) + out_tbl <- dplyr::bind_cols( + tibble::tibble( + biosample_uid = main_attrs$id, + biosample = main_attrs$accession + ), + description_table + ) + out <- list( + out, + out_tbl + ) + out <- setNames(out, c("main", attributes(description_table)$caption)) + } else { + out <- list(out) + out <- setNames(out, "main") + } return(out) } @@ -399,6 +417,57 @@ extract_description <- function( return(out) } +extract_description_table <- function( + x, + biosample, + verbose = getOption("verbose") + ) { + foo <- function (x, biosample, verbose) { + out <- data.frame() + if ("Comment" %in% names(x$Description)) { + if ("Table" %in% names(x$Description$Comment)) { + # works if there is a single table + # TODO find an example with multiple tables + table <- x$Description$Comment$Table + if ("Caption" %in% names(table)) { + caption <- unname(unlist(table$Caption)) + if (length(caption) == 1) { + headers <- table$Header |> unlist() |> unname() + out <- lapply(table$Body, function(x) { + df <- x |> t() |> as.data.frame() + names(df) <- headers + return(df) + }) |> dplyr::bind_rows() + out <- lapply(out, function(x) unname(unlist(x))) + out_lens <- sapply(out, length) + index <- which(out_lens == 0) + if (length(index) > 0) { + for (i in index) { + out[[i]] <- rep(NA, times = max(out_lens)) + } + } + out <- data.frame(out) |> tibble::as_tibble() + attr(out, "caption") <- tolower(caption) + } else { + stop("Multiple tables.") + } + } + } + } + return(out) + } + out <- try(foo(x, biosample, verbose), silent = TRUE) + if (inherits(out, "try-error") | is.null(out)) { + if (verbose) { + message(paste0( + "Could not extract Description table for BioSample ", biosample, "." + )) + } + stop() + } + return(out) +} + # test <- ncbi_get_uid("pathogen cl 1 0[filter]", "biosample") extract_package <- function( x, diff --git a/man/flatten.Rd b/man/flatten.Rd new file mode 100644 index 0000000..4864fd3 --- /dev/null +++ b/man/flatten.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/flatten.R +\name{flatten} +\alias{flatten} +\title{Bind rows in a list of list of data frames} +\usage{ +flatten(x) +} +\arguments{ +\item{x}{list; a list which contains a named list of data frames} +} +\value{ +a flat list of data frames where lists of data frames with the same +name are merged. +} +\description{ +This is a utility function that converts a list of lists, each containing +a names list of data frames into a single flat list of data frames. +} diff --git a/tests/testthat/test-ncbi_get_meta.R b/tests/testthat/test-ncbi_get_meta.R index d436209..b460278 100644 --- a/tests/testthat/test-ncbi_get_meta.R +++ b/tests/testthat/test-ncbi_get_meta.R @@ -4,7 +4,11 @@ test_that("ncbi_get_meta() works with history", { # with history, use history, one batch uids <- ncbi_get_uid("Microthrix parvicella", db = "biosample") meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # with history, use history, multiple batches uids <- ncbi_get_uid( @@ -13,8 +17,12 @@ test_that("ncbi_get_meta() works with history", { batch_size = 5 ) meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) - expect_equal(nrow(meta), length(uids$uid)) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + testthat::expect_equal(nrow(meta$main), length(uids$uid)) }) test_that("ncbi_get_meta() works without history", { @@ -25,14 +33,24 @@ test_that("ncbi_get_meta() works without history", { use_history = TRUE ) meta <- suppressWarnings(ncbi_get_meta(uids, use_history = FALSE)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + # with history, do not use history, multiple batches meta <- suppressWarnings(ncbi_get_meta( uids, use_history = FALSE, batch_size = 5 )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) + # without history, attempt to use history but fall back, one batch uids <- ncbi_get_uid( "Microthrix parvicella", @@ -40,13 +58,21 @@ test_that("ncbi_get_meta() works without history", { use_history = FALSE ) meta <- suppressWarnings(ncbi_get_meta(uids)) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # only ids, one batch meta <- suppressWarnings(ncbi_get_meta( uids$uid, db = "biosample" )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) # only ids, multiple batches meta <- suppressWarnings(ncbi_get_meta( @@ -54,7 +80,11 @@ test_that("ncbi_get_meta() works without history", { db = "biosample", batch_size = 5 )) - expect_s3_class(meta, c("ncbi_meta", "tbl_df", "tbl", "data.frame")) + testthat::expect_true(inherits(meta, "list")) + testthat::expect_s3_class( + meta$main, + c("ncbi_meta", "tbl_df", "tbl", "data.frame") + ) }) test_that("ncbi_get_meta() works with all supported dbs", { diff --git a/tests/testthat/test-ncbi_parse.R b/tests/testthat/test-ncbi_parse.R index f5d5e5c..e836945 100644 --- a/tests/testthat/test-ncbi_parse.R +++ b/tests/testthat/test-ncbi_parse.R @@ -4,15 +4,20 @@ test_that("ncbi_parse() works with a BioSample", { data(examples) biosample_uid <- ncbi_get_uid(examples$biosample[1], db = "biosample") res <- ncbi_get_meta(biosample_uid) - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) - expect_equal(res$biosample, "SAMN02714232") + testthat::expect_true(inherits(res, "list")) + testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame")) + expect_equal(res$main$biosample, "SAMN02714232") }) test_that("ncbi_parse() works with all BioSamples", { data(examples) biosample_uid <- ncbi_get_uid(examples$biosample, db = "biosample") res <- ncbi_get_meta(biosample_uid) - expect_s3_class(res, c("tbl_df", "tbl", "data.frame")) - expect_equal(nrow(res), length(examples$biosample)) + testthat::expect_true(inherits(res, "list")) + testthat::expect_equal(length(res), 2) + testthat::expect_equal(names(res), c("main", "antibiogram")) + testthat::expect_s3_class(res$main, c("tbl_df", "tbl", "data.frame")) + testthat::expect_s3_class(res$antibiogram, c("tbl_df", "tbl", "data.frame")) + expect_equal(nrow(res$main), length(examples$biosample)) })