diff --git a/R/data_curation.R b/R/data_curation.R index b1f1af5..0d19113 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -45,8 +45,10 @@ } # Parse - df <- utils::read.table(text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "") + df <- utils::read.table( + text = raw_data, sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "" + ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -61,7 +63,7 @@ # Make sure the BV-BRC metadata live where they're supposed to .ensure_bvbrc_cache <- function(base_dir = ".", verbose = TRUE, - cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), + cache_rel = file.path("data", "bvbrc", "bvbrcData.duckdb"), cache_table = "bvbrc_bac_data") { base_dir <- normalizePath(base_dir, mustWork = FALSE) cache_db <- file.path(base_dir, cache_rel) @@ -115,12 +117,12 @@ base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") bvbrc_dir <- file.path(data_dir, "bvbrc") - logs_dir <- file.path(data_dir, "logs") + logs_dir <- file.path(data_dir, "logs") dir.create(bvbrc_dir, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") + db_path <- file.path(bvbrc_dir, "bvbrcData.duckdb") table_name <- "bvbrc_bac_data" meta_table <- "__meta" @@ -128,10 +130,10 @@ DBI::dbExecute( con, - glue::glue('CREATE TABLE IF NOT EXISTS {meta_table} ( + glue::glue("CREATE TABLE IF NOT EXISTS {meta_table} ( table_name TEXT PRIMARY KEY, last_updated TIMESTAMP - )') + )") ) # Tiny update check helper @@ -139,12 +141,14 @@ res <- tryCatch( DBI::dbGetQuery( con, - glue::glue('SELECT last_updated FROM {meta_table} - WHERE table_name = {DBI::dbQuoteString(con, table_name)}') + glue::glue("SELECT last_updated FROM {meta_table} + WHERE table_name = {DBI::dbQuoteString(con, table_name)}") ), error = function(e) NULL ) - if (is.null(res) || nrow(res) == 0L) return(NA) + if (is.null(res) || nrow(res) == 0L) { + return(NA) + } as.POSIXct(res$last_updated[[1]], origin = "1970-01-01", tz = "UTC") } @@ -268,7 +272,6 @@ } - #' Resolve `query value` from `user_bacs` for [.getGenomeIDs()] #' #' If query_value is NULL, derive it from user_bacs based on query_type. @@ -277,7 +280,9 @@ #' If nothing suitable is found, throw a fit and an error. #' @keywords internal .resolveQueryValue <- function(query_type, query_value, user_bacs) { - if (!is.null(query_value) && nzchar(query_value)) return(query_value) + if (!is.null(query_value) && nzchar(query_value)) { + return(query_value) + } if (missing(user_bacs) || length(user_bacs) == 0) { stop("Provide query_value or user_bacs for the selected query_type.") } @@ -359,7 +364,7 @@ stringr::str_replace_all("\\s+", "_") |> stringr::str_replace_all("[^A-Za-z0-9._-]", "") - db_dir <- file.path(data_dir, bug_dirname) + db_dir <- file.path(data_dir, bug_dirname) dir.create(db_dir, recursive = TRUE, showWarnings = FALSE) db_file <- paste0(.generateDBname(user_bacs), ".duckdb") @@ -397,14 +402,13 @@ overwrite = FALSE, image = "danylmb/bvbrc:5.3", verbose = TRUE) { - - query_type <- match.arg(query_type) + query_type <- match.arg(query_type) query_value <- .resolveQueryValue(query_type, query_value, user_bacs) - + if (isTRUE(verbose)) { message("Querying BV-BRC: ", query_type, " == ", query_value) } - + # Count count_cmd <- paste0( "docker run --rm ", image, @@ -414,10 +418,10 @@ " --in genome_status,WGS,Complete", " --count" ) - count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) + count_lines <- tryCatch(system(count_cmd, intern = TRUE), error = function(e) character()) count_result <- suppressWarnings(as.integer(if (length(count_lines) >= 2) count_lines[2] else NA_integer_)) if (isTRUE(verbose) && !is.na(count_result)) message("Count returned: ", count_result) - + # Details data_cmd <- paste0( "docker run --rm ", image, @@ -429,7 +433,7 @@ ) data_raw <- tryCatch(system(data_cmd, intern = TRUE), error = function(e) character()) if (length(data_raw) == 0L) stop("BV-BRC returned no data for: ", query_type, " = ", query_value) - + data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, @@ -443,16 +447,16 @@ `genome.species` = as.character(`genome.species`), `genome.strain` = as.character(`genome.strain`) ) - + # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "bac_data", data_result, overwrite = TRUE) - + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) - + list(count_result = count_result, duckdbConnection = con, table_name = "bac_data") } @@ -474,28 +478,28 @@ overwrite = FALSE, verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - + if (isTRUE(verbose)) message("Resolving input taxa.") bac_input_data <- .retrieveCustomQuery(base_dir = base_dir, user_bacs = user_bacs) - + if (is.null(bac_input_data) || nrow(bac_input_data) == 0) { message("No valid input provided or no matches found.") return(NULL) } - + # Resolve per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - - bac_data <- tibble::tibble() + + bac_data <- tibble::tibble() taxon_ids <- unique(bac_input_data$genome.taxon_id) - + if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - + for (i in seq_along(taxon_ids)) { tax <- taxon_ids[i] if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - + res <- .getGenomeIDs( base_dir = base_dir, query_type = "taxon_id", @@ -504,22 +508,22 @@ overwrite = TRUE, verbose = verbose ) - + con <- res$duckdbConnection tbl <- res$table_name each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - + # Close per-iteration connection to avoid open handles piling up try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) } - + if (nrow(bac_data) > 0) { genome_ids <- bac_data |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) genome_ids <- genome_ids[!is.na(genome_ids)] - + if (length(genome_ids) > 0) { if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") return(genome_ids) @@ -578,18 +582,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing AMR query input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -599,12 +603,12 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + # Write a temporary file in data/tmp/ tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_drug_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # Allow abx_filter to be a single string with spaces OR a vector of args abx_args <- if (length(abx_filter) == 1L) { @@ -612,7 +616,7 @@ } else { abx_filter } - + # Query drug data drug_args <- c( "run", "--rm", @@ -622,15 +626,15 @@ abx_args, "--attr", drug_fields ) - + if (isTRUE(verbose)) { message("Running AMR query.") } drug_data <- suppressWarnings(system2("docker", args = drug_args, stdout = TRUE, stderr = TRUE)) - + # Clean up after yourself try(unlink(tmp_in), silent = TRUE) - + return(drug_data) } @@ -655,18 +659,18 @@ verbose = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) data_dir <- file.path(base_dir, "data") - tmp_dir <- file.path(data_dir, "tmp") + tmp_dir <- file.path(data_dir, "tmp") dir.create(tmp_dir, recursive = TRUE, showWarnings = FALSE) - + if (isTRUE(verbose)) { message("Preparing genome metadata input for ", length(batch_genome_IDs), " genomes.") } - + docker_path <- Sys.which("docker") if (!nzchar(docker_path)) { stop("Docker is not available on your PATH but is required.") } - + # Generate genome list with p3-echo echo_args <- c( "run", "--rm", @@ -676,14 +680,14 @@ genome_ids_output <- suppressWarnings( system2("docker", args = echo_args, stdout = TRUE, stderr = TRUE) ) - + tmp_in <- tempfile(tmpdir = tmp_dir, pattern = "genome_ids_", fileext = ".tsv") writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - + # Choose attributes (AMR for this pipeline) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields - + get_args <- c( "run", "--rm", "-v", paste0(data_dir, ":/data"), @@ -691,15 +695,15 @@ "--input", tmp_in_mounted, "--attr", chosen_fields ) - + if (isTRUE(verbose)) { message("Running genome metadata query.") } genome_data <- suppressWarnings(system2("docker", args = get_args, stdout = TRUE, stderr = TRUE)) - + # Cleaning up try(unlink(tmp_in), silent = TRUE) - + return(genome_data) } @@ -735,8 +739,10 @@ retrieveMetadata <- function(user_bacs, base_dir <- normalizePath(base_dir, mustWork = FALSE) if (isTRUE(verbose)) message("Resolving genome IDs for user inputs.") - genome_ids <- .retrieveQueryIDs(base_dir = base_dir, user_bacs = user_bacs, - overwrite = overwrite, verbose = verbose) + genome_ids <- .retrieveQueryIDs( + base_dir = base_dir, user_bacs = user_bacs, + overwrite = overwrite, verbose = verbose + ) if (length(genome_ids) == 0) { message("No genome IDs available for the specified inputs.") return(NULL) @@ -753,8 +759,11 @@ retrieveMetadata <- function(user_bacs, "pmid,resistant_phenotype,", "source,taxon_id,testing_standard" ) - abx_filter <- if (identical(abx, "All")) "--required antibiotic" - else paste0("--in antibiotic,", paste(abx, collapse = ",")) + abx_filter <- if (identical(abx, "All")) { + "--required antibiotic" + } else { + paste0("--in antibiotic,", paste(abx, collapse = ",")) + } amr_fields <- paste0( "assembly_accession,assembly_method,", "bioproject_accession,biosample_accession,", @@ -808,7 +817,10 @@ retrieveMetadata <- function(user_bacs, ), envir = environment() ) - parallel::clusterEvalQ(cluster, { library(tibble); library(dplyr) }) + parallel::clusterEvalQ(cluster, { + library(tibble) + library(dplyr) + }) if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- parallel::parLapply(cluster, genome_batches, function(batch) { @@ -822,7 +834,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_drug_data <- unlist(batch_drug_data, use.names = FALSE) - if (length(combined_drug_data) == 0) { message("No drug data returned."); return(NULL) } + if (length(combined_drug_data) == 0) { + message("No drug data returned.") + return(NULL) + } combined_drug_data_tbl <- tibble::as_tibble( utils::read.table( @@ -847,7 +862,10 @@ retrieveMetadata <- function(user_bacs, ) }) combined_genome_data <- unlist(batch_genome_data, use.names = FALSE) - if (length(combined_genome_data) == 0) { message("No genome data returned."); return(NULL) } + if (length(combined_genome_data) == 0) { + message("No genome data returned.") + return(NULL) + } combined_genome_data_tbl <- tibble::as_tibble( utils::read.table( @@ -860,13 +878,14 @@ retrieveMetadata <- function(user_bacs, dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) # Per-bug DB path - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), - file = file.path(logs_dir, "bvbrc.log"), append = TRUE) + file = file.path(logs_dir, "bvbrc.log"), append = TRUE + ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) @@ -892,10 +911,14 @@ retrieveMetadata <- function(user_bacs, # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { - if (!file.exists(fna_path)) return(invisible(FALSE)) + if (!file.exists(fna_path)) { + return(invisible(FALSE)) + } txt <- readLines(fna_path, warn = FALSE) first <- which(grepl("^\\s*>", txt))[1] - if (is.na(first)) return(invisible(FALSE)) + if (is.na(first)) { + return(invisible(FALSE)) + } if (first > 1L) { txt <- txt[first:length(txt)] txt[1] <- sub("^\\ufeff", "", txt[1]) @@ -907,14 +930,20 @@ retrieveMetadata <- function(user_bacs, # GFF sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .sanitize_gff <- function(gff_path) { - if (!file.exists(gff_path)) return(invisible(FALSE)) + if (!file.exists(gff_path)) { + return(invisible(FALSE)) + } lines <- readLines(gff_path, warn = FALSE) - if (length(lines) == 0L) return(invisible(FALSE)) + if (length(lines) == 0L) { + return(invisible(FALSE)) + } if (!grepl("^##gff-version\\s*3", lines[1])) { lines <- c("##gff-version 3", lines) } out <- vapply(lines, function(line) { - if (grepl("^#", line)) return(line) + if (grepl("^#", line)) { + return(line) + } parts <- strsplit(line, "[\t ]", perl = TRUE)[[1]] if (length(parts) >= 9) { paste(c(parts[1:8], paste(parts[9:length(parts)], collapse = " ")), collapse = "\t") @@ -943,16 +972,19 @@ retrieveMetadata <- function(user_bacs, verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - - on.exit({ - # Keep connection open for caller - NULL - }, add = TRUE) - + + on.exit( + { + # Keep connection open for caller + NULL + }, + add = TRUE + ) + # Good metadata present? Apply AMR filters if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") @@ -962,7 +994,7 @@ retrieveMetadata <- function(user_bacs, message("No data available in 'metadata'.") return(NULL) } - + # Normalize evidence labels initial_metadata <- tibble::as_tibble(initial_metadata) |> dplyr::mutate( @@ -973,13 +1005,13 @@ retrieveMetadata <- function(user_bacs, TRUE ~ `genome_drug.evidence` ) ) - + # AMR and quality filtering filtered_metadata <- initial_metadata |> dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> dplyr::filter(`genome.genome_quality` == "Good") |> dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) - + DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) if (isTRUE(verbose)) { message("Post-filter rows: ", nrow(filtered_metadata)) @@ -987,42 +1019,42 @@ retrieveMetadata <- function(user_bacs, } return(list(duckdbConnection = con, table_name = "filtered")) } - + # Attempt BV-BRC cache location if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - + if (isTRUE(verbose)) { message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") } - + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.") } - + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - + if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("Table 'bvbrc_bac_data' not found in BV-BRC cache: ", cache_db) } - + bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - + + # Derive matches from user_bacs (taxon IDs or species) sel <- tibble::tibble(`genome.genome_id` = character()) - + for (v in user_bacs) { if (suppressWarnings(!is.na(as.numeric(v)))) { # numeric taxon_id match matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] } else { # species substring (case-insensitive) matches <- bv[stringr::str_detect( @@ -1030,7 +1062,7 @@ retrieveMetadata <- function(user_bacs, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - + if (nrow(matches)) { sel <- dplyr::bind_rows( sel, @@ -1041,22 +1073,22 @@ retrieveMetadata <- function(user_bacs, } } sel <- dplyr::distinct(sel) - + if (nrow(sel) == 0L) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No genomes matched user_bacs in BV-BRC cache.") } - + # Minimal 'filtered' for downstream steps (downloaders & genomeList) minimal_filtered <- sel # Ensure expected column name used downstream: # retrieveGenomes() reads "filtered" and expects `genome.genome_id` to exist. DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - + if (isTRUE(verbose)) { message("Wrote table 'filtered' to: ", db_path) } - + list(duckdbConnection = con, table_name = "filtered") } @@ -1068,9 +1100,12 @@ retrieveMetadata <- function(user_bacs, # Prefer bash .pick_shell <- function(image) { chk <- suppressWarnings(system2("docker", - c("run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH"), - stdout = TRUE, stderr = TRUE)) + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" } @@ -1117,21 +1152,28 @@ retrieveMetadata <- function(user_bacs, .ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) return(TRUE) + if (.is_complete_set(out_dir, genomeID, min_bytes)) { + return(TRUE) + } get_one <- function(url, dest) { if (nzchar(Sys.which("wget"))) { res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + c("-q", "-O", shQuote(dest), shQuote(url)), + stdout = TRUE, stderr = TRUE + )) + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) else - c("-L", "-o", shQuote(dest), shQuote(url)) + curl_args <- if (startsWith(url, "ftps://")) { + c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) + } else { + c("-L", "-o", shQuote(dest), shQuote(url)) + } res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) } FALSE @@ -1140,13 +1182,27 @@ retrieveMetadata <- function(user_bacs, for (ext_i in seq_along(files)) { dest <- dests[ext_i] if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) + ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) ok <- FALSE - for (k in 1:tries) { if (get_one(ftps, dest)) { ok <- TRUE; break } } - if (!ok) for (k in 1:2) { if (get_one(https, dest)) { ok <- TRUE; break } } - if (!ok) return(FALSE) + for (k in 1:tries) { + if (get_one(ftps, dest)) { + ok <- TRUE + break + } + } + if (!ok) { + for (k in 1:2) { + if (get_one(https, dest)) { + ok <- TRUE + break + } + } + } + if (!ok) { + return(FALSE) + } } .is_complete_set(out_dir, genomeID, min_bytes) } @@ -1155,7 +1211,7 @@ retrieveMetadata <- function(user_bacs, # p3-dump-genomes to fetch FASTA and .gto files .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) @@ -1164,12 +1220,15 @@ retrieveMetadata <- function(user_bacs, mount <- .docker_path(out_dir) # Safety against Windows-specific CRLF lines before `p3-dump-genomes` - sh_cmd <- sprintf('tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', - basename(ids_file)) + sh_cmd <- sprintf( + 'tr -d "\\r" < /out/%s | p3-dump-genomes --outDir /out --fasta --prot --gto -', + basename(ids_file) + ) - args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) + args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1197,57 +1256,57 @@ retrieveMetadata <- function(user_bacs, # Export GFF from GTO per genomes in each chunk .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { - out_dir <- normalizePath(out_dir, mustWork = FALSE) + out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) writeLines(genome_ids, ids_file) - exporter <- "/usr/bin/rast-export-genome" - shell <- .pick_shell(image) - mount <- .docker_path(out_dir) + exporter <- "/usr/bin/rast-export-genome" + shell <- .pick_shell(image) + mount <- .docker_path(out_dir) stderr_file <- file.path(out_dir, paste0("gff_chunk_", tag, ".stderr.txt")) # GFFs from BV-BRC .gto export are not directly compatible in Panaroo # This block reformats the contig IDs and ensures .gff/.fna pairs work together sh_cmd <- paste0( - 'set -euo pipefail; ', - 'fail_n=0; : > /out/', basename(stderr_file), '; ', + "set -euo pipefail; ", + "fail_n=0; : > /out/", basename(stderr_file), "; ", 'while IFS= read -r b || [ -n "$b" ]; do ', ' b=${b%$\'\\r\'}; [ -n "$b" ] || continue; ', ' gto="/out/${b}.gto"; gff="/out/${b}.PATRIC.gff"; map="/out/${b}.orig2id.tsv"; ', - ' if [ ! -s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), '; continue; fi; ', + ' if [ ! -s "$gto" ]; then echo "MISSING_GTO $b" >>/out/', basename(stderr_file), "; continue; fi; ", # Export GFF ' [ -s "$gff" ] || ', exporter, ' -i "$gto" -o "$gff" gff ', - ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), '; fail_n=$((fail_n+1)); continue; }; ', + ' || { echo "EXPORT_FAIL_GFF $b" >>/out/', basename(stderr_file), "; fail_n=$((fail_n+1)); continue; }; ", # Build mapping original_id -> id, and default to id if original_id missing - ' if command -v jq >/dev/null 2>&1; then ', + " if command -v jq >/dev/null 2>&1; then ", ' jq -r \'.contigs[] | [(.original_id // .id), .id] | @tsv\' "$gto" > "$map"; ', - ' else ', + " else ", ' python3 - "$gto" > "$map" <<\'PY\'\n', - 'import sys, json\n', - 'g = json.load(open(sys.argv[1]))\n', + "import sys, json\n", + "g = json.load(open(sys.argv[1]))\n", 'for c in g.get("contigs", []):\n', ' o = c.get("original_id") or c.get("id")\n', ' i = c.get("id")\n', - ' if o and i:\n', + " if o and i:\n", ' print(f"{o}\\t{i}")\n', - 'PY\n', - ' fi; ', + "PY\n", + " fi; ", # Relabel GFF sequence IDs: original_id -> id - ' awk \'FNR==NR{m[$1]=$2; next} ', - ' /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ', - ' /^#/ { print; next } ', + " awk 'FNR==NR{m[$1]=$2; next} ", + " /^##sequence-region/ { if ($2 in m) {$2=m[$2]} print; next } ", + " /^#/ { print; next } ", ' { if ($1 in m) $1=m[$1]; print }\' "$map" "$gff" > "${gff}.tmp" && mv "${gff}.tmp" "$gff"; ', - - 'done < /out/', basename(ids_file), '; ', - 'exit 0' + "done < /out/", basename(ids_file), "; ", + "exit 0" ) args <- c("run", "--rm", "-v", paste0(mount, ":/out"), image, shell, "-lc", shQuote(sh_cmd)) - res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status"); if (is.null(st)) st <- 0L + res <- suppressWarnings(system2("docker", args = args, stdout = TRUE, stderr = TRUE)) + st <- attr(res, "status") + if (is.null(st)) st <- 0L if (st != 0L && tries > 1L) { Sys.sleep(1) @@ -1290,14 +1349,15 @@ retrieveGenomes <- function(base_dir = ".", cli_gff_workers = 4L, chunk_size = 50L, verbose = TRUE) { - - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # IDs from .filterGenomes() if (isTRUE(verbose)) message("Filtering genomes before download.") f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) return(character()) + if (is.null(f_out)) { + return(character()) + } con <- f_out$duckdbConnection tbl <- f_out$table_name @@ -1306,12 +1366,12 @@ retrieveGenomes <- function(base_dir = ".", dplyr::pull(`genome.genome_id`) # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + bug_dir <- dirname(paths$db_path) genome_path <- file.path(bug_dir, "genomes") - logs_dir <- file.path(base_dir, "data", "logs") + logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) - dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) + dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) # Adding support for resuming a download if (isTRUE(skip_existing)) { @@ -1322,10 +1382,12 @@ retrieveGenomes <- function(base_dir = ".", if (length(ids) == 0L) { if (isTRUE(verbose)) message("All genomes already complete.") - all_complete <- .list_complete(genome_path, - tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`)) + all_complete <- .list_complete( + genome_path, + tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> + dplyr::distinct(`genome.genome_id`) |> + dplyr::pull(`genome.genome_id`) + ) return(all_complete) } @@ -1334,18 +1396,23 @@ retrieveGenomes <- function(base_dir = ".", if (isTRUE(verbose)) message("Trying FTPS download. Workers=", ftp_workers) future::plan(future::multisession, workers = max(1, ftp_workers)) ft_ok <- future.apply::future_lapply(ids, function(gid) .ftp_download_one(gid, genome_path), - future.seed = TRUE) + future.seed = TRUE + ) ok_ids <- ids[unlist(ft_ok)] if (isTRUE(verbose)) message("Complete file sets for ", length(ok_ids), " genomes (FTP).") return(c(ok_ids, .list_complete(genome_path, setdiff(ids, ok_ids)))) } # CLI for FASTA, FAA, and GTO, then GFF from GTO - chunks <- split(ids, ceiling(seq_along(ids)/chunk_size)) + chunks <- split(ids, ceiling(seq_along(ids) / chunk_size)) # Parallel chunk containers - if (isTRUE(verbose)) message("CLI being run in parallel for ", length(chunks), - " data chunks.") + if (isTRUE(verbose)) { + message( + "CLI being run in parallel for ", length(chunks), + " data chunks." + ) + } future::plan(future::multisession, workers = max(1, cli_fasta_workers)) fa_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_dump_fastas_gto_chunk(image, genome_path, vec, tag), @@ -1355,8 +1422,12 @@ retrieveGenomes <- function(base_dir = ".", if (!all(fa_res) && isTRUE(verbose)) warning(sum(!fa_res), " data chunks failed.") # GFF extraction in parallel containers - if (isTRUE(verbose)) message("GFF extraction being run in parallel for ", - length(chunks), " data chunks.") + if (isTRUE(verbose)) { + message( + "GFF extraction being run in parallel for ", + length(chunks), " data chunks." + ) + } future::plan(future::multisession, workers = max(1, cli_gff_workers)) g_res <- future.apply::future_mapply( FUN = function(vec, tag) .cli_export_gff_chunk(image, genome_path, vec, tag), @@ -1367,8 +1438,12 @@ retrieveGenomes <- function(base_dir = ".", # Success set: .fna + .PATRIC.faa + .PATRIC.gff all present per isolate ok_ids <- ids[vapply(ids, .is_complete_set, logical(1), dir = genome_path)] - if (isTRUE(verbose)) message("Complete file sets downloaded for ", - length(ok_ids), " genomes.") + if (isTRUE(verbose)) { + message( + "Complete file sets downloaded for ", + length(ok_ids), " genomes." + ) + } ok_ids } @@ -1387,15 +1462,14 @@ retrieveGenomes <- function(base_dir = ".", genomeList <- function(base_dir = ".", user_bacs, verbose = TRUE) { - base_dir <- normalizePath(base_dir, mustWork = FALSE) - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - db_path <- paths$db_path - bug_dir <- dirname(db_path) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") - files_all <- list.files(genome_path, full.names = TRUE) - files_all <- files_all[file.info(files_all)$size > 100] + files_all <- list.files(genome_path, full.names = TRUE) + files_all <- files_all[file.info(files_all)$size > 100] # Separate by type gff_files <- files_all[grepl("\\.PATRIC\\.gff$", files_all)] @@ -1414,14 +1488,18 @@ genomeList <- function(base_dir = ".", faa_path <- file.path(genome_path, paste0(genomeID, ".PATRIC.faa")) data.frame( - genome_id = genomeID, - gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, - fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, - faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, + genome_id = genomeID, + gff_path = if (file.exists(gff_path) && file.info(gff_path)$size > 100) gff_path else NA, + fna_path = if (file.exists(fna_path) && file.info(fna_path)$size > 100) fna_path else NA, + faa_path = if (file.exists(faa_path) && file.info(faa_path)$size > 100) faa_path else NA, panaroo_input = if ( file.exists(gff_path) && file.exists(fna_path) && file.exists(faa_path) && - file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 - ) paste(gff_path, fna_path) else NA, + file.info(gff_path)$size > 100 && file.info(fna_path)$size > 100 && file.info(faa_path)$size > 100 + ) { + paste(gff_path, fna_path) + } else { + NA + }, stringsAsFactors = FALSE ) }) @@ -1480,7 +1558,7 @@ prepareGenomes <- function(user_bacs, method = c("ftp", "cli"), overwrite = FALSE, verbose = TRUE) { - method <- match.arg(method) + method <- match.arg(method) base_dir <- normalizePath(base_dir, mustWork = FALSE) # Ensure the BV-BRC metadata cache exists, fetch if missing diff --git a/R/data_processing.R b/R/data_processing.R index 78ab404..8f31622 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -81,7 +81,7 @@ # Convert to container-visible paths genome_filepath_cont <- .to_container(genome_filepath_host, host_root = mount_host, container_root = mount_cont) - output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) + output_dir_cont <- .to_container(output_dir_host, host_root = mount_host, container_root = mount_cont) # Run Panaroo in Docker cmd_args <- c( @@ -99,14 +99,17 @@ "--remove-invalid-genes", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(panaroo_threads_per_job) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(panaroo_threads_per_job) ) - res <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) e) + res <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) e + ) if (inherits(res, "error")) { stop(sprintf("Docker/Panaroo failed to launch: %s", res$message)) @@ -179,7 +182,6 @@ family_seq_identity = 0.5, threads = 8, split_jobs = FALSE) { - duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) @@ -213,7 +215,7 @@ filtered_panaroo_input <- sapply(split_files[unlist(valid_entries)], paste, collapse = " ") total_lines <- length(filtered_panaroo_input) - batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines + batch_size <- if (isTRUE(split_jobs)) ceiling(total_lines / 5) else total_lines panaroo_batches <- split(filtered_panaroo_input, ceiling(seq_along(filtered_panaroo_input) / batch_size)) n_jobs <- length(panaroo_batches) @@ -265,8 +267,7 @@ len_dif_percent = 0.95, cluster_threshold = 0.95, family_seq_identity = 0.5, - threads = 8){ - + threads = 8) { input_path <- .docker_path(input_path) # Fail fast if Docker is missing @@ -302,9 +303,9 @@ "--merge_paralogs", "--core_threshold", as.character(core_threshold), "--len_dif_percent", as.character(len_dif_percent), - "--threshold", as.character(cluster_threshold), - "-f", as.character(family_seq_identity), - "-t", as.character(threads) + "--threshold", as.character(cluster_threshold), + "-f", as.character(family_seq_identity), + "-t", as.character(threads) ) system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) @@ -314,7 +315,6 @@ } - #' Load Panaroo gene presence/absence table into DuckDB #' #' Reads `gene_presence_absence.csv` and constructs a genome-by-gene count @@ -326,13 +326,13 @@ #' @return A tibble containing the gene count matrix. #' #' @keywords internal -.panaroo2geneTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneTable <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_count <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_count <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(-c(Non.unique.Gene.name, Annotation)) |> tidyr::pivot_longer(cols = -1) |> @@ -356,13 +356,13 @@ #' @return A tibble with `Gene` and `Annotation` columns. #' #' @keywords internal -.panaroo2geneNames <- function(panaroo_output_path, duckdb_path){ +.panaroo2geneNames <- function(panaroo_output_path, duckdb_path) { filepath <- file.path(normalizePath(panaroo_output_path), "gene_presence_absence.csv") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_names <- read.table(filepath, sep=",", header=TRUE, fill=TRUE, quote="") |> + gene_names <- read.table(filepath, sep = ",", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> dplyr::select(c(Gene, Annotation)) @@ -381,15 +381,15 @@ #' @return A tibble containing the struct matrix. #' #' @keywords internal -.panaroo2StructTable <- function(panaroo_output_path, duckdb_path){ +.panaroo2StructTable <- function(panaroo_output_path, duckdb_path) { struct_filepath <- file.path(normalizePath(panaroo_output_path), "struct_presence_absence.Rtab") duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) - gene_struct <- read.table(struct_filepath, sep="\t", header=TRUE, fill=TRUE, quote="") |> + gene_struct <- read.table(struct_filepath, sep = "\t", header = TRUE, fill = TRUE, quote = "") |> tibble::as_tibble() |> - tidyr::pivot_longer(cols= -1) |> + tidyr::pivot_longer(cols = -1) |> tidyr::pivot_wider(names_from = Gene, values_from = value) |> dplyr::rename("genome_id" = "name") |> dplyr::mutate(genome_id = stringr::str_replace_all(genome_id, c("^X" = "", "\\.PATRIC$" = ""))) @@ -409,7 +409,7 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path){ +.panaroo2OtherTables <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) duckdb_path <- normalizePath(duckdb_path) fasta_filepath <- file.path(panaroo_output_path, "pan_genome_reference.fa") @@ -418,15 +418,19 @@ gene_fasta <- Biostrings::readDNAStringSet(filepath = fasta_filepath) DBI::dbWriteTable(con, "gene_ref_seq", - tibble::tibble(name = names(gene_fasta), - sequence = as.character(gene_fasta)), - overwrite = TRUE) + tibble::tibble( + name = names(gene_fasta), + sequence = as.character(gene_fasta) + ), + overwrite = TRUE + ) readr::read_csv(file.path(panaroo_output_path, "gene_presence_absence.csv")) |> dplyr::select(-`Non-unique Gene name`) |> tidyr::pivot_longer(-c("Gene", "Annotation"), - names_to = "genome_ids", - values_to = "protein_ids") |> + names_to = "genome_ids", + values_to = "protein_ids" + ) |> dplyr::mutate(genome_ids = gsub(".PATRIC", "", genome_ids)) |> dplyr::select(genome_ids, Gene, protein_ids) |> dplyr::distinct() |> @@ -447,9 +451,9 @@ #' @return Invisibly returns TRUE. #' #' @keywords internal -.panaroo2duckdb <- function(panaroo_output_path, duckdb_path){ +.panaroo2duckdb <- function(panaroo_output_path, duckdb_path) { panaroo_output_path <- normalizePath(panaroo_output_path) - duckdb_path <- normalizePath(duckdb_path) + duckdb_path <- normalizePath(duckdb_path) .panaroo2geneTable(panaroo_output_path, duckdb_path) .panaroo2geneNames(panaroo_output_path, duckdb_path) @@ -484,7 +488,6 @@ threads = 0, memory = 0, extra_args = c("-g", "1")) { - # Fail fast if Docker is missing if (!nzchar(Sys.which("docker"))) { stop("Docker is not available on your PATH but is required to run CD-HIT.") @@ -530,7 +533,7 @@ "weizhongli1987/cdhit:4.8.1", "cd-hit", "-i", .to_container(cdhit_input_faa, mount_host, mount_cont), - "-o", .to_container(clustered_faa, mount_host, mount_cont), + "-o", .to_container(clustered_faa, mount_host, mount_cont), "-c", as.character(identity), "-n", as.character(word_length), "-T", as.character(threads), @@ -540,19 +543,24 @@ ) message("Running cd-hit via Docker...") - output <- tryCatch({ - system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) - }, error = function(e) { - stop("cd-hit execution failed: ", e$message) - }) + output <- tryCatch( + { + system2("docker", args = cmd_args, stdout = TRUE, stderr = TRUE) + }, + error = function(e) { + stop("cd-hit execution failed: ", e$message) + } + ) if (!file.exists(clustered_faa)) { stop("cd-hit failed: output file not found. Check stderr:\n", paste(output, collapse = "\n")) } # Ensure .clstr exists (used downstream) if (!file.exists(paste0(clustered_faa, ".clstr"))) { - stop("cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), - "\nFull output:\n", paste(output, collapse = "\n")) + stop( + "cd-hit did not produce the expected .clstr file at: ", paste0(clustered_faa, ".clstr"), + "\nFull output:\n", paste(output, collapse = "\n") + ) } message("cd-hit completed successfully.") @@ -672,14 +680,14 @@ runPanaroo2Duckdb <- function(duckdb_path, if (isTRUE(verbose)) message("Launching Panaroo.") .runPanaroo( - duckdb_path = duckdb_path, - output_path = out_dir, - core_threshold = core_threshold, - len_dif_percent = len_dif_percent, + duckdb_path = duckdb_path, + output_path = out_dir, + core_threshold = core_threshold, + len_dif_percent = len_dif_percent, cluster_threshold = cluster_threshold, family_seq_identity = family_seq_identity, - threads = threads, - split_jobs = split_jobs + threads = threads, + split_jobs = split_jobs ) # Identify Panaroo outputs that contain a final_graph.gml file @@ -730,8 +738,10 @@ runPanaroo2Duckdb <- function(duckdb_path, .parseProteinClusters <- function(clustered_faa) { clstr <- paste0(clustered_faa, ".clstr") if (!file.exists(clstr)) { - stop("CD-HIT cluster file not found: ", clstr, - "\nEnsure .runCDHIT() completed successfully and produced the .clstr file.") + stop( + "CD-HIT cluster file not found: ", clstr, + "\nEnsure .runCDHIT() completed successfully and produced the .clstr file." + ) } lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1 @@ -740,20 +750,22 @@ runPanaroo2Duckdb <- function(duckdb_path, for (i in seq_along(cluster_ids)) { start <- cluster_ids[i] + 1 - end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) + end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) cluster_lines <- lines[start:end] # This finds the reference cluster ID and names the cluster with it ref_line <- grep("\\*$", cluster_lines, value = TRUE) ref_id <- if (length(ref_line) > 0) { - stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+") + stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+") } else { paste0("Cluster_", i - 1) } # Pull genome IDs - genome_matches <- stringr::str_match(cluster_lines, - "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+")[, 2] + genome_matches <- stringr::str_match( + cluster_lines, + "fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+" + )[, 2] genome_matches <- genome_matches[!is.na(genome_matches)] if (length(genome_matches) > 0) { @@ -810,9 +822,9 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map) names_faa <- names(cdhit_output_faa) |> tibble::as_tibble() |> dplyr::mutate( - proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], - proteinName= stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) + proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"), + locus_tag = stringr::str_match(value, "peg(?:sc)?\\.[0-9]+\\|([^\\s]+)")[, 2], + proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) ) |> dplyr::select(-value) @@ -828,47 +840,47 @@ CDHIT2duckdb <- function(duckdb_path, word_length = 5, threads = 0, memory = 0, - extra_args = c("-g", "1")){ - + extra_args = c("-g", "1")) { duckdb_path <- normalizePath(duckdb_path) con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) if (missing(output_path) || output_path %in% c(".", "results", "results/")) { - output_path <- dirname(duckdb_path) # e.g., ./results/ + output_path <- dirname(duckdb_path) # e.g., ./results/ } output_path <- normalizePath(output_path) cdhit_outputs <- .runCDHIT(duckdb_path, - output_path, - output_prefix = output_prefix, - identity = identity, - word_length = word_length, - threads = threads, - memory = memory, - extra_args = extra_args) - - cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) + output_path, + output_prefix = output_prefix, + identity = identity, + word_length = word_length, + threads = threads, + memory = memory, + extra_args = extra_args + ) + + cluster_map <- .parseProteinClusters(cdhit_outputs$clustered_faa) cluster_count <- .buildProtMatrices(cluster_map) DBI::dbWriteTable(con, "protein_count", cluster_count, overwrite = TRUE) cluster_fasta <- cdhit_outputs$cdhit_input_faa - cluster_name <- .clusterNames(cluster_map, cluster_fasta) + cluster_name <- .clusterNames(cluster_map, cluster_fasta) DBI::dbWriteTable(con, "protein_names", cluster_name, overwrite = TRUE) clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa) DBI::dbWriteTable(con, "protein_cluster_seq", - tibble::tibble( - name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - sequence = as.character(clustered_faa) - ), - overwrite = TRUE) + tibble::tibble( + name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"), + sequence = as.character(clustered_faa) + ), + overwrite = TRUE + ) invisible(TRUE) } - #' Check or install InterProScan data bundle #' #' Ensures that the InterProScan data directory exists locally, downloading @@ -885,12 +897,12 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .checkInterProData <- function( - version = "5.76-107.0", - dest_dir = "inst/extdata/interpro", - docker_image = sprintf("interpro/interproscan:%s", version), - platform = "linux/amd64", - curl_bin = "curl", - verbose = TRUE + version = "5.76-107.0", + dest_dir = "inst/extdata/interpro", + docker_image = sprintf("interpro/interproscan:%s", version), + platform = "linux/amd64", + curl_bin = "curl", + verbose = TRUE ) { msg <- function(...) if (verbose) message(sprintf(...)) @@ -907,25 +919,29 @@ CDHIT2duckdb <- function(duckdb_path, } # Download bundle if needed - tar_url <- sprintf("http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", - version, version) - md5_url <- paste0(tar_url, ".md5") + tar_url <- sprintf( + "http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/%s/alt/interproscan-data-%s.tar.gz", + version, version + ) + md5_url <- paste0(tar_url, ".md5") tar_path <- file.path(dest_dir, basename(tar_url)) md5_path <- paste0(tar_path, ".md5") if (!file.exists(tar_path)) { msg("Downloading InterProScan data bundle.") - status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) - status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) - if (status_tar != 0 || status_md5 != 0) + status_tar <- system2(curl_bin, c("-L", "-o", tar_path, tar_url)) + status_md5 <- system2(curl_bin, c("-L", "-o", md5_path, md5_url)) + if (status_tar != 0 || status_md5 != 0) { stop("Failed to download InterProScan data bundle.") + } } msg("Verifying MD5 checksum.") md5_expected <- sub("\\s+.*$", "", readLines(md5_path)[1]) - md5_actual <- tools::md5sum(tar_path)[[1]] - if (!identical(tolower(md5_expected), tolower(md5_actual))) + md5_actual <- tools::md5sum(tar_path)[[1]] + if (!identical(tolower(md5_expected), tolower(md5_actual))) { stop("MD5 checksum mismatch for InterProScan data bundle.") + } msg("Extracting InterProScan data bundle.") utils::untar(tar_path, exdir = dest_dir, tar = "internal") @@ -946,9 +962,11 @@ CDHIT2duckdb <- function(duckdb_path, #' #' @keywords internal .getDfIPRColNames <- function() { - c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", - "Status", "RunDate", "IPRAcc", "IPRDesc", "placeholder") + "Status", "RunDate", "IPRAcc", "IPRDesc", "placeholder" + ) } #' Internal helpers for reading InterProScan TSV outputs @@ -993,8 +1011,9 @@ CDHIT2duckdb <- function(duckdb_path, #' @keywords internal .readIPRscanTsv <- function(filepath) { readr::read_tsv(filepath, - col_types = .getDfIPRColTypes(), - col_names = .getDfIPRColNames()) + col_types = .getDfIPRColTypes(), + col_names = .getDfIPRColNames() + ) } @@ -1026,7 +1045,7 @@ CDHIT2duckdb <- function(duckdb_path, file_format, docker_image = sprintf("interpro/interproscan:%s", "5.76-107.0")) { # Normalize and mount paths - path <- .docker_path(path) + path <- .docker_path(path) bind_data <- .docker_path(ipr_data_path) dir.create(file.path(path, "tmp", "iprscan"), recursive = TRUE, showWarnings = FALSE) @@ -1050,7 +1069,7 @@ CDHIT2duckdb <- function(duckdb_path, "-v", paste0(bind_data, ":/opt/interproscan/data"), "-w", "/work", docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), + "--input", .to_container(temp_fasta_file, path, "/work"), "--cpu", as.character(threads), "-f", file_format, "--appl", appl_str, @@ -1058,31 +1077,34 @@ CDHIT2duckdb <- function(duckdb_path, ) - status <- tryCatch({ - system2( - "docker", - args = c( - "run", - "--rm", - "--platform", "linux/amd64", # force amd64 for ARM hosts - "-v", paste0(path, ":", "/work"), - "-v", paste0(bind_data, ":/opt/interproscan/data"), - "-w", "/work", - docker_image, - "--input", .to_container(temp_fasta_file, path, "/work"), - "--cpu", as.character(threads), - "-f", file_format, - "--appl", appl_str, - "-b", chunk_out_file_base_cont - ), - stdout = TRUE, - stderr = TRUE - ) - }, error = function(e) { - stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) - }) + status <- tryCatch( + { + system2( + "docker", + args = c( + "run", + "--rm", + "--platform", "linux/amd64", # force amd64 for ARM hosts + "-v", paste0(path, ":", "/work"), + "-v", paste0(bind_data, ":/opt/interproscan/data"), + "-w", "/work", + docker_image, + "--input", .to_container(temp_fasta_file, path, "/work"), + "--cpu", as.character(threads), + "-f", file_format, + "--appl", appl_str, + "-b", chunk_out_file_base_cont + ), + stdout = TRUE, + stderr = TRUE + ) + }, + error = function(e) { + stop(sprintf("InterProScan execution failed for chunk %d: %s", chunk_id, e$message)) + } + ) - out_tsv <- paste0(chunk_out_file_base_host, ".tsv") + out_tsv <- paste0(chunk_out_file_base_host, ".tsv") out_tsvgz <- paste0(chunk_out_file_base_host, ".tsv.gz") if (file.exists(out_tsv)) { @@ -1090,8 +1112,10 @@ CDHIT2duckdb <- function(duckdb_path, } else if (file.exists(out_tsvgz)) { return(out_tsvgz) } else { - stop(sprintf("InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", - chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n"))) + stop(sprintf( + "InterProScan produced no output for chunk %d. Checked: %s and %s.\nLast message:\n%s", + chunk_id, out_tsv, out_tsvgz, paste(status, collapse = "\n") + )) } } @@ -1100,14 +1124,13 @@ domainFromIPR <- function(duckdb_path, path, out_file_base = "iprscan", appl = c("Pfam"), - ipr_version = "5.76-107.0", + ipr_version = "5.76-107.0", ipr_dest_dir = "inst/extdata/interpro", ipr_platform = "linux/amd64", auto_prepare_data = TRUE, threads = 8, file_format = "TSV", docker_repo = "interpro/interproscan") { - duckdb_path <- normalizePath(duckdb_path) if (missing(path) || path %in% c(".", "results", "results/")) { path <- dirname(duckdb_path) @@ -1126,8 +1149,10 @@ domainFromIPR <- function(duckdb_path, verbose = TRUE ) } else { - list(data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), - ready = NA) + list( + data_dir = file.path(ipr_dest_dir, sprintf("interproscan-%s", ipr_version), "data"), + ready = NA + ) } ipr_data_path <- ipr_info$data_dir @@ -1138,11 +1163,12 @@ domainFromIPR <- function(duckdb_path, on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) sequences_df <- dplyr::tbl(con, "protein_cluster_seq") |> tibble::as_tibble() - if (nrow(sequences_df) == 0L) + if (nrow(sequences_df) == 0L) { stop("No sequences found in 'protein_cluster_seq'. Please run CDHIT2duckdb() first.") + } # Chunking for parallel (not currently implemented due to memory limits) - chunks <- list(sequences_df) # Force 1 chunk for RAM limits + chunks <- list(sequences_df) # Force 1 chunk for RAM limits # Forcing 1 container operation for RAM limits workers <- 1 @@ -1179,24 +1205,28 @@ domainFromIPR <- function(duckdb_path, # Combine results tsvs <- Filter(function(x) !is.null(x) && file.exists(x), results) - if (length(tsvs) == 0L) + if (length(tsvs) == 0L) { stop("InterProScan produced no usable outputs. Check Docker logs above.") + } df_iprscan <- do.call(rbind, lapply(tsvs, .readIPRscanTsv)) # Load processed tables (unchanged) DBI::dbWriteTable(con, "domain_names", - df_iprscan |> - dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), - overwrite = TRUE) + df_iprscan |> + dplyr::select(AccNum, DB.ID, SignDesc, IPRAcc, IPRDesc, StartLoc, StopLoc), + overwrite = TRUE + ) df_protein_domain_pa <- df_iprscan |> dplyr::select(AccNum, DB.ID, IPRAcc, placeholder) |> dplyr::mutate(domain_ID = stringr::str_glue("{DB.ID}_{IPRAcc}")) |> dplyr::distinct() |> dplyr::mutate(placeholder = stringr::str_replace_all(placeholder, "-", "1")) |> - tidyr::pivot_wider(id_cols = AccNum, names_from = domain_ID, values_from = placeholder, - values_fill = "0") |> + tidyr::pivot_wider( + id_cols = AccNum, names_from = domain_ID, values_from = placeholder, + values_fill = "0" + ) |> dplyr::group_by(AccNum) |> dplyr::summarize(across(everything(), ~ ifelse(any(. == "1"), "1", "0")), .groups = "drop") |> dplyr::mutate(across(-AccNum, as.numeric)) @@ -1204,8 +1234,9 @@ domainFromIPR <- function(duckdb_path, protein_filter <- dplyr::tbl(con, "protein_count") |> tibble::as_tibble() accs <- unique(df_protein_domain_pa$AccNum) accs_in_matrix <- intersect(accs, colnames(protein_filter)) - if (length(accs_in_matrix) == 0L) + if (length(accs_in_matrix) == 0L) { stop("No InterPro accessions match protein_count columns.") + } protein_filter <- protein_filter |> dplyr::select(genome_id, dplyr::all_of(accs_in_matrix)) df_protein_domain_pa <- df_protein_domain_pa |> @@ -1223,8 +1254,8 @@ domainFromIPR <- function(duckdb_path, } # Clean BV-BRC metadata, then save as Parquet files -cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ - duckdb_path <- normalizePath(duckdb_path) +cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { + duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. if (missing(path) || path %in% c(".", "results", "results/")) { @@ -1247,20 +1278,22 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ clean_drug <- readr::read_tsv(file.path(ref_file_path, "clean_drug.tsv")) drug_class <- readr::read_tsv(file.path(ref_file_path, "drug_class.tsv")) - drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) + drug_abbr <- readr::read_tsv(file.path(ref_file_path, "drug_abbr.tsv")) class_abbr <- readr::read_tsv(file.path(ref_file_path, "class_abbr.tsv")) clean_countries <- readr::read_tsv(file.path(ref_file_path, "cleaned_bvbrc_countries.tsv")) |> - dplyr::select("raw_entry", "clean_name", "short_name")|> + dplyr::select("raw_entry", "clean_name", "short_name") |> dplyr::distinct() dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> - dplyr::select("genome_drug.genome_id", "genome_drug.antibiotic", - "genome_drug.genome_name", "genome_drug.laboratory_typing_method", - "genome_drug.resistant_phenotype", "genome_drug.taxon_id", - "genome_drug.pmid", "genome.collection_year", - "genome.isolation_country", "genome.host_common_name", - "genome.isolation_source", "genome.species") |> + dplyr::select( + "genome_drug.genome_id", "genome_drug.antibiotic", + "genome_drug.genome_name", "genome_drug.laboratory_typing_method", + "genome_drug.resistant_phenotype", "genome_drug.taxon_id", + "genome_drug.pmid", "genome.collection_year", + "genome.isolation_country", "genome.host_common_name", + "genome.isolation_source", "genome.species" + ) |> dplyr::left_join(clean_drug, by = c("genome_drug.antibiotic" = "original_drug")) |> dplyr::filter(!is.na(cleaned_drug)) |> dplyr::left_join(drug_class, by = c("cleaned_drug" = "drug")) |> @@ -1269,7 +1302,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) resistance_summary <- dplyr::tbl(con, "filtered") |> - tibble::as_tibble() |> + tibble::as_tibble() |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> dplyr::group_by(genome_drug.genome_id) |> dplyr::summarise( @@ -1283,7 +1316,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> dplyr::select(-cleaned_drug) |> dplyr::left_join(clean_countries, by = c("genome.isolation_country" = "raw_entry")) |> - dplyr::rename("cleaned_country"="clean_name", "country_abbr"="short_name") |> + dplyr::rename("cleaned_country" = "clean_name", "country_abbr" = "short_name") |> dplyr::mutate(genome.isolation_country = cleaned_country) |> dplyr::select(-cleaned_country) |> dplyr::left_join(resistance_summary, by = "genome_drug.genome_id") |> @@ -1291,38 +1324,42 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ is.na(resistant_classes) ~ genome_drug.resistant_phenotype, TRUE ~ resistant_classes )) |> - dplyr::mutate(num_resistant_classes= dplyr::case_when( + dplyr::mutate(num_resistant_classes = dplyr::case_when( is.na(num_resistant_classes) ~ 0, TRUE ~ num_resistant_classes )) |> dplyr::mutate(genome.collection_year = as.numeric(genome.collection_year)) |> - dplyr::mutate(year_bin = cut(genome.collection_year, breaks = year_breaks, - right = FALSE, include.lowest = TRUE, - labels = paste(year_breaks[-length(year_breaks)], - year_breaks[-1] - 1, sep = "-"))) |> + dplyr::mutate(year_bin = cut(genome.collection_year, + breaks = year_breaks, + right = FALSE, include.lowest = TRUE, + labels = paste(year_breaks[-length(year_breaks)], + year_breaks[-1] - 1, + sep = "-" + ) + )) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) # Parquet output paths - genes_parquet <- file.path(path, "gene_count.parquet") - gene_names_parquet <- file.path(path, "gene_names.parquet") - gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") - genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") - struct_parquet <- file.path(path, "struct.parquet") + genes_parquet <- file.path(path, "gene_count.parquet") + gene_names_parquet <- file.path(path, "gene_names.parquet") + gene_ref_seq_parquet <- file.path(path, "gene_seqs.parquet") + genome_gene_protein_parquet <- file.path(path, "genome_gene_protein.parquet") + struct_parquet <- file.path(path, "struct.parquet") - proteins_parquet <- file.path(path, "protein_count.parquet") - domains_parquet <- file.path(path, "domain_count.parquet") + proteins_parquet <- file.path(path, "protein_count.parquet") + domains_parquet <- file.path(path, "domain_count.parquet") - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' - domain_names_parquet <- file.path(path, "domain_names.parquet") - protein_names_parquet <- file.path(path, "protein_names.parquet") + domain_names_parquet <- file.path(path, "domain_names.parquet") + protein_names_parquet <- file.path(path, "protein_names.parquet") - protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") + protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") writeCompressedParquet <- function(df, path) { arrow::write_parquet( @@ -1334,7 +1371,9 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ ) } - db_name <- duckdb_path |> stringr::str_split_i(".duckdb", i = 1) |> paste0("_parquet.duckdb") + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) @@ -1399,8 +1438,8 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/"){ # debug/complete views: amr_phenotype, genome_data, original_metadata DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", amr_phenotype_parquet)) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", genome_data_parquet)) @@ -1565,7 +1604,7 @@ runDataProcessing <- function(duckdb_path, cdhit_identity = 0.9, cdhit_word_length = 5, cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), + cdhit_extra_args = c("-g", "1"), cdhit_output_prefix = "cdhit_out", # InterPro ipr_appl = c("Pfam"), @@ -1651,4 +1690,3 @@ runDataProcessing <- function(duckdb_path, parquet_duckdb_path = normalizePath(parquet_duckdb_path) )) } - diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd index 59ccbc5..9d9a36f 100644 --- a/vignettes/intro.Rmd +++ b/vignettes/intro.Rmd @@ -31,13 +31,15 @@ For metadata curation with `amRdata`, use `retrieveMetadata()` to create a DuckD ```{r} # Download all AMR data for a species from BV-BRC -retrieveMetadata(user_bacs = "Shigella flexneri", - filter_type = "AMR", - base_dir = "../data/", - abx = "All", - overwrite = FALSE, - image = "danylmb/bvbrc:5.3", - verbose = FALSE) +retrieveMetadata( + user_bacs = "Shigella flexneri", + filter_type = "AMR", + base_dir = "../data/", + abx = "All", + overwrite = FALSE, + image = "danylmb/bvbrc:5.3", + verbose = FALSE +) ``` This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB @@ -45,16 +47,18 @@ This wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to a DuckDB For downloading genomes with paired AMR phenotype data, use `retrieveGenomes()` ```{r } -retrieveGenomes(base_dir = "../data/", - user_bacs = "Shigella flexneri", - method = c("cli"), - image = "danylmb/bvbrc:5.3", - skip_existing = TRUE, - ftp_workers = 8L, - cli_fasta_workers = 4L, - cli_gff_workers = 4L, - chunk_size = 50L, - verbose = TRUE) +retrieveGenomes( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + method = c("cli"), + image = "danylmb/bvbrc:5.3", + skip_existing = TRUE, + ftp_workers = 8L, + cli_fasta_workers = 4L, + cli_gff_workers = 4L, + chunk_size = 50L, + verbose = TRUE +) ``` This returns character vector of genome IDs and wrote complete file sets on disk. @@ -62,10 +66,11 @@ This returns character vector of genome IDs and wrote complete file sets on disk To write a .txt file listing downloaded genome filepaths (.fna, .faa, .gff) ```{r } -genomeList(base_dir = "../data/", - user_bacs = "Shigella flexneri", - verbose = TRUE) - +genomeList( + base_dir = "../data/", + user_bacs = "Shigella flexneri", + verbose = TRUE +) ``` #### A wrapper for downloading genomes and listing the paths @@ -77,11 +82,13 @@ Internally runs: Allows users to input a species or taxon ID and automate all data downloading and curation steps. ```{r } -prepareGenomes(user_bacs = "Shigella flexneri", - base_dir = "../data/", - method = c("cli"), - overwrite = FALSE, - verbose = TRUE) +prepareGenomes( + user_bacs = "Shigella flexneri", + base_dir = "../data/", + method = c("cli"), + overwrite = FALSE, + verbose = TRUE +) ``` ## Data Processing @@ -95,32 +102,34 @@ Internally runs: 4. `cleanData()` -\> Clean metadata and export the feature tables to Parquet + Parquet-backed DuckDB ```{r} -runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", - output_path = NULL, - # unified threads for all tools - threads = 16, - # Panaroo - panaroo_split_jobs = FALSE, - panaroo_core_threshold = 0.90, - panaroo_len_dif_percent = 0.95, - panaroo_cluster_threshold = 0.95, - panaroo_family_seq_identity = 0.5, - # CD-HIT - cdhit_identity = 0.9, - cdhit_word_length = 5, - cdhit_memory = 0, - cdhit_extra_args = c("-g","1"), - cdhit_output_prefix = "cdhit_out", - # InterPro - ipr_appl = c("Pfam"), - ipr_threads_unused = NULL, - ipr_version = "5.76-107.0", - ipr_dest_dir = "inst/extdata/interpro", - ipr_platform = "linux/amd64", - auto_prepare_data = TRUE, - # Metadata cleaning - ref_file_path = "../data_raw/", - verbose = TRUE) +runDataProcessing( + duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", + output_path = NULL, + # unified threads for all tools + threads = 16, + # Panaroo + panaroo_split_jobs = FALSE, + panaroo_core_threshold = 0.90, + panaroo_len_dif_percent = 0.95, + panaroo_cluster_threshold = 0.95, + panaroo_family_seq_identity = 0.5, + # CD-HIT + cdhit_identity = 0.9, + cdhit_word_length = 5, + cdhit_memory = 0, + cdhit_extra_args = c("-g", "1"), + cdhit_output_prefix = "cdhit_out", + # InterPro + ipr_appl = c("Pfam"), + ipr_threads_unused = NULL, + ipr_version = "5.76-107.0", + ipr_dest_dir = "inst/extdata/interpro", + ipr_platform = "linux/amd64", + auto_prepare_data = TRUE, + # Metadata cleaning + ref_file_path = "../data_raw/", + verbose = TRUE +) ``` ## Plots @@ -128,8 +137,10 @@ runDataProcessing(duckdb_path = "../data/Shigella_flexneri/Sfl.duckdb", Simple stats and plots to explore metadata ```{r } -generateSummary("data/metadata_parquet", - out_path = "data/") -generatePlots("data/metadata_parquet", - out_path = "data/") +generateSummary("data/metadata_parquet", + out_path = "data/" +) +generatePlots("data/metadata_parquet", + out_path = "data/" +) ```