From 9bcadf4b1de53b91def943e174d93a4ceccfc96b Mon Sep 17 00:00:00 2001
From: Caleb Grenko <33234745+grenkoca@users.noreply.github.com>
Date: Tue, 21 Jun 2022 10:34:55 -0400
Subject: [PATCH 1/7] Added catch for NA values

---
 bin/013-compare_de_results.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bin/013-compare_de_results.py b/bin/013-compare_de_results.py
index 2b7770a..341d4e1 100755
--- a/bin/013-compare_de_results.py
+++ b/bin/013-compare_de_results.py
@@ -203,6 +203,9 @@ def main():
     small_value[filt] = np.nanmin(df['pvalue'][np.invert(filt)])  # ** 1.5
     df['pval_neglog10'] = np.log10(df['pvalue'] + small_value) * -1
     df['pval_signedneglog10'] = df['pval_neglog10'] * np.sign(df['log2fc'])
+    
+    # Drop all rows with nans
+    df.dropna(axis=0, inplace=True)
 
     # For each combination of columns...
     # 1. Plot p-value distribution

From 2054bf96d62f5ff5c9d2ac5f89cb960af8f49ed8 Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Sun, 11 Sep 2022 15:58:29 -0400
Subject: [PATCH 2/7] Updated to add options for count/cp10k

---
 bin/011-run_deseq.R                   | 45 ++++++++++++++++++---------
 bin/011-run_differential_expression.R | 26 +++++++++++-----
 bin/013-compare_de_results.py         |  2 +-
 main.nf                               |  2 +-
 modules/differential_expression.nf    | 21 +++++++------
 5 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/bin/011-run_deseq.R b/bin/011-run_deseq.R
index 6bacecd..1b40766 100755
--- a/bin/011-run_deseq.R
+++ b/bin/011-run_deseq.R
@@ -61,6 +61,8 @@ DE_calculate_dge <- function(
 
   ## Grab design and remove the `test_var`
   des <- design(dds)
+  
+
   reduced_des <- des[,-which(colnames(des) == coef_value), drop=F]
 
   ## Set parallel cores
@@ -91,20 +93,35 @@ DE_calculate_dge <- function(
 
   ## Defaults are taken from DESeq2 SC recommendations:
   # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis
-  de_results <- DESeq2::DESeq(
-    dds,
-    test = "LRT",
-    fitType = de_method,
-    sfType = sfType,
-    reduced = reduced_des,
-    quiet = !verbose,
-    minReplicatesForReplace = minReplicatesForReplace,
-    useT = T, ## Shouldn't matter. Only used for Wald
-    minmu = minmu,
-    parallel = par,
-    BPPARAM = BiocParallel::MulticoreParam(n_cores)
-  )
-
+  print("Estimating dispersions")
+  # Added by Caleb, Sept 7 2022
+  dds <- estimateSizeFactors(dds)
+  dds <- estimateDispersionsGeneEst(dds)
+  print("Saving estimates")
+  dispersions(dds) <- mcols(dds)$dispGeneEst
+
+  if (TRUE) {
+	  de_results <- DESeq2::DESeq(
+	    dds,
+	    test = "LRT",
+	    fitType = de_method,
+	    sfType = sfType,
+	    reduced = reduced_des,
+	    quiet = !verbose,
+	    minReplicatesForReplace = minReplicatesForReplace,
+	    useT = T, ## Shouldn't matter. Only used for Wald
+	    minmu = minmu,
+	    parallel = par,
+	    BPPARAM = BiocParallel::MulticoreParam(n_cores)
+	  )
+
+  } else {
+  	de_results <- DESeq2::nbinomLRT(
+	    dds,
+	    reduced = reduced_des,
+	    quiet = !verbose
+	    )
+  }
   # Get results
   rez <- DESeq2::results(
     de_results,
diff --git a/bin/011-run_differential_expression.R b/bin/011-run_differential_expression.R
index fd24c4e..35c6f28 100755
--- a/bin/011-run_differential_expression.R
+++ b/bin/011-run_differential_expression.R
@@ -77,11 +77,16 @@ optionList <- list(
                         help = "Key to use to determine sample source of cells."
   ),
 
-  optparse::make_option(c("-l", "--mean_cp10k_filter"),
+  optparse::make_option(c("-l", "--filter_threshold"),
                         type = "double",
                         default = 1,
-                        help = "Filter to remove genes with fewer cp10k
-                        averages."
+                        help = "Filter to remove genes with fewer cp10k/counts
+                        averages. See option `--filter_type`"
+  ),
+  optparse::make_option(c("-x", "--filter_type"),
+                        type = "character",
+                        default = 1,
+                        help = "Either 'counts' or 'cp10k'"
   ),
 
   optparse::make_option(c("-m", "--pre_filter_genes"),
@@ -553,7 +558,8 @@ get_empty_df <- function() {
 ######################## Read Data & Manipulate ################################
 verbose <- arguments$options$verbose
 output_file_base <- arguments$options$out_file
-mean_filter <- arguments$options$mean_cp10k_filter
+mean_filter <- arguments$options$filter_threshold
+filter_type <- arguments$options$filter_type
 formula_str <- arguments$options$formula
 
 # Read all data in
@@ -1006,12 +1012,18 @@ if (nrow(de_results) > 0) {
 
   # Filter
   if (verbose) {
-    cat(sprintf("Filtering out genes with mean cp10k expression < %s...\n",
+    cat(sprintf("Filtering out genes with mean %s < %s...\n",
+		filter_type,
                 mean_filter))
   }
   n_genes_before <- nrow(de_results)
-  de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ]
-
+  print(filter_type)
+  # Removed because it wasn't working right
+  if (filter_type == "cp10k") {
+  	de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ]
+  } else if (filter_type == "counts") {
+  	de_results <- de_results[which(de_results$mean_counts > mean_filter), ]
+  }
   if (verbose) {
     cat(sprintf("Done. Filtered %s genes.\n",
                 n_genes_before - nrow(de_results)))
diff --git a/bin/013-compare_de_results.py b/bin/013-compare_de_results.py
index 341d4e1..7d4efba 100755
--- a/bin/013-compare_de_results.py
+++ b/bin/013-compare_de_results.py
@@ -64,7 +64,7 @@ def plot_qq(
     # the axes)
     axis_max = max(df['pval_neglog10'])
 
-    if facet_var is None:
+    if facet_var is None or len(df[facet_var].unique()) < 2:
         pvals = df.groupby(by=color_var).apply(
             calculate_expected_pval
         ).reset_index(level=color_var, drop=True)
diff --git a/main.nf b/main.nf
index bcd606f..434c6fc 100644
--- a/main.nf
+++ b/main.nf
@@ -1,6 +1,6 @@
 #!/usr/bin/env nextflow
 
-nextflow.preview.dsl = 2
+nextflow.enable.dsl=2
 
 VERSION = "0.0.1" // Do not edit, controlled by bumpversion.
 
diff --git a/modules/differential_expression.nf b/modules/differential_expression.nf
index 208532a..d829bbd 100644
--- a/modules/differential_expression.nf
+++ b/modules/differential_expression.nf
@@ -55,7 +55,8 @@ process run_differential_expression {
         path(anndata)
         val(cell_label_column)
         val(experiment_id)
-        val(mean_cp10k_filter)
+        val(filter_threshold)
+	val(filter_type)
         each cell_label
         each model
 
@@ -178,7 +179,8 @@ process run_differential_expression {
             --variable_target "${variable_target}" \
             --method "${model.method}" \
             --method_script $baseDir/bin/${method_script} \
-            --mean_cp10k_filter ${mean_cp10k_filter} \
+            --filter_threshold ${filter_threshold} \
+            --filter_type ${filter_type} \
             --ruvseq_n_empirical_genes ${model.ruvseq_n_empirical_genes} \
             --ruvseq_min_pvalue ${model.ruvseq_min_pvalue} \
             --ruvseq_k_factors ${model.ruvseq_k} \
@@ -347,7 +349,7 @@ process merge_de_dataframes {
         """
         echo "merge_de_dataframes: ${process_info}"
         echo "publish_directory: ${outdir}"
-        sleep 5m
+        sleep 15s
         merge_dataframes.py \
             --dataframe_keys '${result_keys}' \
             --dataframe_paths '${result_paths}' \
@@ -778,7 +780,8 @@ workflow wf__differential_expression {
             anndata,
             anndata_cell_label,
             experiment_key,
-            model.mean_cp10k_filter,
+            model.filter_threshold,
+	    model.filter_type,
             // '1',  // just run on first cluster for development
             cell_labels,  // run for all clusters for run time
             model.value
@@ -833,11 +836,11 @@ workflow wf__differential_expression {
             de_results_merged
         )
         // Basic plots of the differential expression results across all models
-        plot_merged_dge(
-            outdir,
-            merge_de_dataframes.out.merged_results,
-            de_plot_config.mean_expression_filter.value
-        )
+        // plot_merged_dge(
+        //    outdir,
+        //    merge_de_dataframes.out.merged_results,
+        //    de_plot_config.mean_expression_filter.value
+        //)
 
         // First run GO Enrich on merged DGE results
         if (goenrich_config.run_process) {

From 173bebd6f5e9445a838c91c0da2c5d8c733b8a2c Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Sun, 11 Sep 2022 16:03:31 -0400
Subject: [PATCH 3/7] Clarified alternate dispersion estimate fitting method
 (turned off by default, not a flag just in the if (TRUE) block

---
 bin/011-run_deseq.R | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/bin/011-run_deseq.R b/bin/011-run_deseq.R
index 1b40766..91cffa4 100755
--- a/bin/011-run_deseq.R
+++ b/bin/011-run_deseq.R
@@ -93,13 +93,6 @@ DE_calculate_dge <- function(
 
   ## Defaults are taken from DESeq2 SC recommendations:
   # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis
-  print("Estimating dispersions")
-  # Added by Caleb, Sept 7 2022
-  dds <- estimateSizeFactors(dds)
-  dds <- estimateDispersionsGeneEst(dds)
-  print("Saving estimates")
-  dispersions(dds) <- mcols(dds)$dispGeneEst
-
   if (TRUE) {
 	  de_results <- DESeq2::DESeq(
 	    dds,
@@ -116,6 +109,16 @@ DE_calculate_dge <- function(
 	  )
 
   } else {
+	# Added by Caleb, Sept 7 2022
+	#   This is an alternative curvefitting method in case 
+	#   the dispersion estimates are < 2 orders of magnitude from
+	#   the minimum. (i.e. the target phenotype has a small range)
+	#   Turns out there was an error in my calculation and this isn't
+	#   needed anymore, but I left it in as an option.
+	dds <- estimateSizeFactors(dds)
+	dds <- estimateDispersionsGeneEst(dds)
+	dispersions(dds) <- mcols(dds)$dispGeneEst
+
   	de_results <- DESeq2::nbinomLRT(
 	    dds,
 	    reduced = reduced_des,

From 8387a597486140f110c815a1116ba8a87aa4a0ef Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Sun, 11 Sep 2022 16:04:47 -0400
Subject: [PATCH 4/7] Re-enabled plotting

---
 modules/differential_expression.nf | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modules/differential_expression.nf b/modules/differential_expression.nf
index d829bbd..c4e3811 100644
--- a/modules/differential_expression.nf
+++ b/modules/differential_expression.nf
@@ -836,11 +836,11 @@ workflow wf__differential_expression {
             de_results_merged
         )
         // Basic plots of the differential expression results across all models
-        // plot_merged_dge(
-        //    outdir,
-        //    merge_de_dataframes.out.merged_results,
-        //    de_plot_config.mean_expression_filter.value
-        //)
+        plot_merged_dge(
+            outdir,
+            merge_de_dataframes.out.merged_results,
+            de_plot_config.mean_expression_filter.value
+        )
 
         // First run GO Enrich on merged DGE results
         if (goenrich_config.run_process) {

From dcdabf1415d14d3b04a33509f4327be760c450ab Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Mon, 12 Sep 2022 13:49:15 -0400
Subject: [PATCH 5/7] Added catch for invalid filter type

---
 bin/011-run_differential_expression.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bin/011-run_differential_expression.R b/bin/011-run_differential_expression.R
index 35c6f28..545936f 100755
--- a/bin/011-run_differential_expression.R
+++ b/bin/011-run_differential_expression.R
@@ -1023,6 +1023,8 @@ if (nrow(de_results) > 0) {
   	de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ]
   } else if (filter_type == "counts") {
   	de_results <- de_results[which(de_results$mean_counts > mean_filter), ]
+  } else { 
+	stop(sprintf("Error: invalid argument for `filter_type`: expected 'cp10k' or 'counts', got %s", filter_type))
   }
   if (verbose) {
     cat(sprintf("Done. Filtered %s genes.\n",

From e669c2d1639e6b647bc8e55e65976c396865f074 Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Thu, 15 Sep 2022 08:45:13 -0400
Subject: [PATCH 6/7] Added version check to see if nextflow has stable release
 of dsl=2

---
 main.nf | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 434c6fc..46006eb 100644
--- a/main.nf
+++ b/main.nf
@@ -1,6 +1,10 @@
 #!/usr/bin/env nextflow
 
-nextflow.enable.dsl=2
+if (nextflow.version.matches('20.07.1+')) {
+    nextflow.enable.dsl=2
+} else {
+    nextflow.preview.dsl=2
+}
 
 VERSION = "0.0.1" // Do not edit, controlled by bumpversion.
 

From 20d17e3590ea15bed57c4bd35c2634d140af220f Mon Sep 17 00:00:00 2001
From: grenkoca <grenkoca@gmail.com>
Date: Thu, 15 Sep 2022 08:49:38 -0400
Subject: [PATCH 7/7] Added version check to see if nextflow has stable release
 of dsl=2

---
 main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.nf b/main.nf
index 46006eb..ac23b41 100644
--- a/main.nf
+++ b/main.nf
@@ -1,6 +1,6 @@
 #!/usr/bin/env nextflow
 
-if (nextflow.version.matches('20.07.1+')) {
+if (nextflow.version.matches('>= 20.07.1')) {
     nextflow.enable.dsl=2
 } else {
     nextflow.preview.dsl=2