From 9bcadf4b1de53b91def943e174d93a4ceccfc96b Mon Sep 17 00:00:00 2001 From: Caleb Grenko <33234745+grenkoca@users.noreply.github.com> Date: Tue, 21 Jun 2022 10:34:55 -0400 Subject: [PATCH 1/7] Added catch for NA values --- bin/013-compare_de_results.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/013-compare_de_results.py b/bin/013-compare_de_results.py index 2b7770a..341d4e1 100755 --- a/bin/013-compare_de_results.py +++ b/bin/013-compare_de_results.py @@ -203,6 +203,9 @@ def main(): small_value[filt] = np.nanmin(df['pvalue'][np.invert(filt)]) # ** 1.5 df['pval_neglog10'] = np.log10(df['pvalue'] + small_value) * -1 df['pval_signedneglog10'] = df['pval_neglog10'] * np.sign(df['log2fc']) + + # Drop all rows with nans + df.dropna(axis=0, inplace=True) # For each combination of columns... # 1. Plot p-value distribution From 2054bf96d62f5ff5c9d2ac5f89cb960af8f49ed8 Mon Sep 17 00:00:00 2001 From: grenkoca Date: Sun, 11 Sep 2022 15:58:29 -0400 Subject: [PATCH 2/7] Updated to add options for count/cp10k --- bin/011-run_deseq.R | 45 ++++++++++++++++++--------- bin/011-run_differential_expression.R | 26 +++++++++++----- bin/013-compare_de_results.py | 2 +- main.nf | 2 +- modules/differential_expression.nf | 21 +++++++------ 5 files changed, 64 insertions(+), 32 deletions(-) diff --git a/bin/011-run_deseq.R b/bin/011-run_deseq.R index 6bacecd..1b40766 100755 --- a/bin/011-run_deseq.R +++ b/bin/011-run_deseq.R @@ -61,6 +61,8 @@ DE_calculate_dge <- function( ## Grab design and remove the `test_var` des <- design(dds) + + reduced_des <- des[,-which(colnames(des) == coef_value), drop=F] ## Set parallel cores @@ -91,20 +93,35 @@ DE_calculate_dge <- function( ## Defaults are taken from DESeq2 SC recommendations: # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis - de_results <- DESeq2::DESeq( - dds, - test = "LRT", - fitType = de_method, - sfType = sfType, - reduced = reduced_des, - quiet = !verbose, - minReplicatesForReplace = minReplicatesForReplace, - useT = T, ## Shouldn't matter. Only used for Wald - minmu = minmu, - parallel = par, - BPPARAM = BiocParallel::MulticoreParam(n_cores) - ) - + print("Estimating dispersions") + # Added by Caleb, Sept 7 2022 + dds <- estimateSizeFactors(dds) + dds <- estimateDispersionsGeneEst(dds) + print("Saving estimates") + dispersions(dds) <- mcols(dds)$dispGeneEst + + if (TRUE) { + de_results <- DESeq2::DESeq( + dds, + test = "LRT", + fitType = de_method, + sfType = sfType, + reduced = reduced_des, + quiet = !verbose, + minReplicatesForReplace = minReplicatesForReplace, + useT = T, ## Shouldn't matter. Only used for Wald + minmu = minmu, + parallel = par, + BPPARAM = BiocParallel::MulticoreParam(n_cores) + ) + + } else { + de_results <- DESeq2::nbinomLRT( + dds, + reduced = reduced_des, + quiet = !verbose + ) + } # Get results rez <- DESeq2::results( de_results, diff --git a/bin/011-run_differential_expression.R b/bin/011-run_differential_expression.R index fd24c4e..35c6f28 100755 --- a/bin/011-run_differential_expression.R +++ b/bin/011-run_differential_expression.R @@ -77,11 +77,16 @@ optionList <- list( help = "Key to use to determine sample source of cells." ), - optparse::make_option(c("-l", "--mean_cp10k_filter"), + optparse::make_option(c("-l", "--filter_threshold"), type = "double", default = 1, - help = "Filter to remove genes with fewer cp10k - averages." + help = "Filter to remove genes with fewer cp10k/counts + averages. See option `--filter_type`" + ), + optparse::make_option(c("-x", "--filter_type"), + type = "character", + default = 1, + help = "Either 'counts' or 'cp10k'" ), optparse::make_option(c("-m", "--pre_filter_genes"), @@ -553,7 +558,8 @@ get_empty_df <- function() { ######################## Read Data & Manipulate ################################ verbose <- arguments$options$verbose output_file_base <- arguments$options$out_file -mean_filter <- arguments$options$mean_cp10k_filter +mean_filter <- arguments$options$filter_threshold +filter_type <- arguments$options$filter_type formula_str <- arguments$options$formula # Read all data in @@ -1006,12 +1012,18 @@ if (nrow(de_results) > 0) { # Filter if (verbose) { - cat(sprintf("Filtering out genes with mean cp10k expression < %s...\n", + cat(sprintf("Filtering out genes with mean %s < %s...\n", + filter_type, mean_filter)) } n_genes_before <- nrow(de_results) - de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ] - + print(filter_type) + # Removed because it wasn't working right + if (filter_type == "cp10k") { + de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ] + } else if (filter_type == "counts") { + de_results <- de_results[which(de_results$mean_counts > mean_filter), ] + } if (verbose) { cat(sprintf("Done. Filtered %s genes.\n", n_genes_before - nrow(de_results))) diff --git a/bin/013-compare_de_results.py b/bin/013-compare_de_results.py index 341d4e1..7d4efba 100755 --- a/bin/013-compare_de_results.py +++ b/bin/013-compare_de_results.py @@ -64,7 +64,7 @@ def plot_qq( # the axes) axis_max = max(df['pval_neglog10']) - if facet_var is None: + if facet_var is None or len(df[facet_var].unique()) < 2: pvals = df.groupby(by=color_var).apply( calculate_expected_pval ).reset_index(level=color_var, drop=True) diff --git a/main.nf b/main.nf index bcd606f..434c6fc 100644 --- a/main.nf +++ b/main.nf @@ -1,6 +1,6 @@ #!/usr/bin/env nextflow -nextflow.preview.dsl = 2 +nextflow.enable.dsl=2 VERSION = "0.0.1" // Do not edit, controlled by bumpversion. diff --git a/modules/differential_expression.nf b/modules/differential_expression.nf index 208532a..d829bbd 100644 --- a/modules/differential_expression.nf +++ b/modules/differential_expression.nf @@ -55,7 +55,8 @@ process run_differential_expression { path(anndata) val(cell_label_column) val(experiment_id) - val(mean_cp10k_filter) + val(filter_threshold) + val(filter_type) each cell_label each model @@ -178,7 +179,8 @@ process run_differential_expression { --variable_target "${variable_target}" \ --method "${model.method}" \ --method_script $baseDir/bin/${method_script} \ - --mean_cp10k_filter ${mean_cp10k_filter} \ + --filter_threshold ${filter_threshold} \ + --filter_type ${filter_type} \ --ruvseq_n_empirical_genes ${model.ruvseq_n_empirical_genes} \ --ruvseq_min_pvalue ${model.ruvseq_min_pvalue} \ --ruvseq_k_factors ${model.ruvseq_k} \ @@ -347,7 +349,7 @@ process merge_de_dataframes { """ echo "merge_de_dataframes: ${process_info}" echo "publish_directory: ${outdir}" - sleep 5m + sleep 15s merge_dataframes.py \ --dataframe_keys '${result_keys}' \ --dataframe_paths '${result_paths}' \ @@ -778,7 +780,8 @@ workflow wf__differential_expression { anndata, anndata_cell_label, experiment_key, - model.mean_cp10k_filter, + model.filter_threshold, + model.filter_type, // '1', // just run on first cluster for development cell_labels, // run for all clusters for run time model.value @@ -833,11 +836,11 @@ workflow wf__differential_expression { de_results_merged ) // Basic plots of the differential expression results across all models - plot_merged_dge( - outdir, - merge_de_dataframes.out.merged_results, - de_plot_config.mean_expression_filter.value - ) + // plot_merged_dge( + // outdir, + // merge_de_dataframes.out.merged_results, + // de_plot_config.mean_expression_filter.value + //) // First run GO Enrich on merged DGE results if (goenrich_config.run_process) { From 173bebd6f5e9445a838c91c0da2c5d8c733b8a2c Mon Sep 17 00:00:00 2001 From: grenkoca Date: Sun, 11 Sep 2022 16:03:31 -0400 Subject: [PATCH 3/7] Clarified alternate dispersion estimate fitting method (turned off by default, not a flag just in the if (TRUE) block --- bin/011-run_deseq.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/bin/011-run_deseq.R b/bin/011-run_deseq.R index 1b40766..91cffa4 100755 --- a/bin/011-run_deseq.R +++ b/bin/011-run_deseq.R @@ -93,13 +93,6 @@ DE_calculate_dge <- function( ## Defaults are taken from DESeq2 SC recommendations: # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis - print("Estimating dispersions") - # Added by Caleb, Sept 7 2022 - dds <- estimateSizeFactors(dds) - dds <- estimateDispersionsGeneEst(dds) - print("Saving estimates") - dispersions(dds) <- mcols(dds)$dispGeneEst - if (TRUE) { de_results <- DESeq2::DESeq( dds, @@ -116,6 +109,16 @@ DE_calculate_dge <- function( ) } else { + # Added by Caleb, Sept 7 2022 + # This is an alternative curvefitting method in case + # the dispersion estimates are < 2 orders of magnitude from + # the minimum. (i.e. the target phenotype has a small range) + # Turns out there was an error in my calculation and this isn't + # needed anymore, but I left it in as an option. + dds <- estimateSizeFactors(dds) + dds <- estimateDispersionsGeneEst(dds) + dispersions(dds) <- mcols(dds)$dispGeneEst + de_results <- DESeq2::nbinomLRT( dds, reduced = reduced_des, From 8387a597486140f110c815a1116ba8a87aa4a0ef Mon Sep 17 00:00:00 2001 From: grenkoca Date: Sun, 11 Sep 2022 16:04:47 -0400 Subject: [PATCH 4/7] Re-enabled plotting --- modules/differential_expression.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modules/differential_expression.nf b/modules/differential_expression.nf index d829bbd..c4e3811 100644 --- a/modules/differential_expression.nf +++ b/modules/differential_expression.nf @@ -836,11 +836,11 @@ workflow wf__differential_expression { de_results_merged ) // Basic plots of the differential expression results across all models - // plot_merged_dge( - // outdir, - // merge_de_dataframes.out.merged_results, - // de_plot_config.mean_expression_filter.value - //) + plot_merged_dge( + outdir, + merge_de_dataframes.out.merged_results, + de_plot_config.mean_expression_filter.value + ) // First run GO Enrich on merged DGE results if (goenrich_config.run_process) { From dcdabf1415d14d3b04a33509f4327be760c450ab Mon Sep 17 00:00:00 2001 From: grenkoca Date: Mon, 12 Sep 2022 13:49:15 -0400 Subject: [PATCH 5/7] Added catch for invalid filter type --- bin/011-run_differential_expression.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/011-run_differential_expression.R b/bin/011-run_differential_expression.R index 35c6f28..545936f 100755 --- a/bin/011-run_differential_expression.R +++ b/bin/011-run_differential_expression.R @@ -1023,6 +1023,8 @@ if (nrow(de_results) > 0) { de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ] } else if (filter_type == "counts") { de_results <- de_results[which(de_results$mean_counts > mean_filter), ] + } else { + stop(sprintf("Error: invalid argument for `filter_type`: expected 'cp10k' or 'counts', got %s", filter_type)) } if (verbose) { cat(sprintf("Done. Filtered %s genes.\n", From e669c2d1639e6b647bc8e55e65976c396865f074 Mon Sep 17 00:00:00 2001 From: grenkoca Date: Thu, 15 Sep 2022 08:45:13 -0400 Subject: [PATCH 6/7] Added version check to see if nextflow has stable release of dsl=2 --- main.nf | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 434c6fc..46006eb 100644 --- a/main.nf +++ b/main.nf @@ -1,6 +1,10 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 +if (nextflow.version.matches('20.07.1+')) { + nextflow.enable.dsl=2 +} else { + nextflow.preview.dsl=2 +} VERSION = "0.0.1" // Do not edit, controlled by bumpversion. From 20d17e3590ea15bed57c4bd35c2634d140af220f Mon Sep 17 00:00:00 2001 From: grenkoca Date: Thu, 15 Sep 2022 08:49:38 -0400 Subject: [PATCH 7/7] Added version check to see if nextflow has stable release of dsl=2 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 46006eb..ac23b41 100644 --- a/main.nf +++ b/main.nf @@ -1,6 +1,6 @@ #!/usr/bin/env nextflow -if (nextflow.version.matches('20.07.1+')) { +if (nextflow.version.matches('>= 20.07.1')) { nextflow.enable.dsl=2 } else { nextflow.preview.dsl=2