CollinsLabBioComp · grenkoca · Jun 21, 2022 · Sep 6, 2022 · Sep 11, 2022 · Sep 11, 2022
diff --git a/bin/011-run_deseq.R b/bin/011-run_deseq.R
@@ -61,6 +61,8 @@ DE_calculate_dge <- function(
 
   ## Grab design and remove the `test_var`
   des <- design(dds)
+
+
   reduced_des <- des[,-which(colnames(des) == coef_value), drop=F]
 
   ## Set parallel cores
@@ -91,20 +93,38 @@ DE_calculate_dge <- function(
 
   ## Defaults are taken from DESeq2 SC recommendations:
   # https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis
-  de_results <- DESeq2::DESeq(
-    dds,
-    test = "LRT",
-    fitType = de_method,
-    sfType = sfType,
-    reduced = reduced_des,
-    quiet = !verbose,
-    minReplicatesForReplace = minReplicatesForReplace,
-    useT = T, ## Shouldn't matter. Only used for Wald
-    minmu = minmu,
-    parallel = par,
-    BPPARAM = BiocParallel::MulticoreParam(n_cores)
-  )
-
+  if (TRUE) {
+	  de_results <- DESeq2::DESeq(
+	    dds,
+	    test = "LRT",
+	    fitType = de_method,
+	    sfType = sfType,
+	    reduced = reduced_des,
+	    quiet = !verbose,
+	    minReplicatesForReplace = minReplicatesForReplace,
+	    useT = T, ## Shouldn't matter. Only used for Wald
+	    minmu = minmu,
+	    parallel = par,
+	    BPPARAM = BiocParallel::MulticoreParam(n_cores)
+	  )
+
+  } else {
+	# Added by Caleb, Sept 7 2022
+	#   This is an alternative curve-fitting method in case
+	#   the dispersion estimates are within 2 orders of magnitude of
+	#   the minimum (i.e. the target phenotype has a small range).
+	#   It turns out there was an error in my calculation and this isn't
+	#   needed anymore, but I left it in (unreachable while the guard is `if (TRUE)`).
+	dds <- estimateSizeFactors(dds)
+	dds <- estimateDispersionsGeneEst(dds)
+	dispersions(dds) <- mcols(dds)$dispGeneEst
+
+  	de_results <- DESeq2::nbinomLRT(
+	    dds,
+	    reduced = reduced_des,
+	    quiet = !verbose
+	    )
+  }
   # Get results
   rez <- DESeq2::results(
     de_results,

diff --git a/bin/011-run_differential_expression.R b/bin/011-run_differential_expression.R
@@ -77,11 +77,16 @@ optionList <- list(
                         help = "Key to use to determine sample source of cells."
   ),
 
-  optparse::make_option(c("-l", "--mean_cp10k_filter"),
+  optparse::make_option(c("-l", "--filter_threshold"),
                         type = "double",
                         default = 1,
-                        help = "Filter to remove genes with fewer cp10k
-                        averages."
+                        help = "Filter to remove genes with fewer cp10k/counts
+                        averages. See option `--filter_type`"
+  ),
+  optparse::make_option(c("-x", "--filter_type"),
+                        type = "character",
+                        default = "cp10k",  # same metric the old -l flag used
+                        help = "Either 'counts' or 'cp10k'"
   ),
 
   optparse::make_option(c("-m", "--pre_filter_genes"),
@@ -553,7 +558,8 @@ get_empty_df <- function() {
 ######################## Read Data & Manipulate ################################
 verbose <- arguments$options$verbose
 output_file_base <- arguments$options$out_file
-mean_filter <- arguments$options$mean_cp10k_filter
+mean_filter <- arguments$options$filter_threshold
+filter_type <- arguments$options$filter_type
 formula_str <- arguments$options$formula
 
 # Read all data in
@@ -1006,12 +1012,20 @@ if (nrow(de_results) > 0) {
 
   # Filter
   if (verbose) {
-    cat(sprintf("Filtering out genes with mean cp10k expression < %s...\n",
+    cat(sprintf("Filtering out genes with mean %s < %s...\n",
+		filter_type,
                 mean_filter))
   }
   n_genes_before <- nrow(de_results)
-  de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ]
-
+  print(filter_type)
+  # Removed because it wasn't working right
+  if (filter_type == "cp10k") {
+  	de_results <- de_results[which(de_results$mean_cp10k > mean_filter), ]
+  } else if (filter_type == "counts") {
+  	de_results <- de_results[which(de_results$mean_counts > mean_filter), ]
+  } else { 
+	stop(sprintf("Error: invalid argument for `filter_type`: expected 'cp10k' or 'counts', got %s", filter_type))
+  }
   if (verbose) {
     cat(sprintf("Done. Filtered %s genes.\n",
                 n_genes_before - nrow(de_results)))

diff --git a/bin/013-compare_de_results.py b/bin/013-compare_de_results.py
@@ -64,7 +64,7 @@ def plot_qq(
     # the axes)
     axis_max = max(df['pval_neglog10'])
 
-    if facet_var is None:
+    if facet_var is None or len(df[facet_var].unique()) < 2:
         pvals = df.groupby(by=color_var).apply(
             calculate_expected_pval
         ).reset_index(level=color_var, drop=True)
@@ -203,6 +203,9 @@ def main():
     small_value[filt] = np.nanmin(df['pvalue'][np.invert(filt)])  # ** 1.5
     df['pval_neglog10'] = np.log10(df['pvalue'] + small_value) * -1
     df['pval_signedneglog10'] = df['pval_neglog10'] * np.sign(df['log2fc'])
+
+    # Drop all rows with nans
+    df.dropna(axis=0, inplace=True)
 
     # For each combination of columns...
     # 1. Plot p-value distribution

diff --git a/main.nf b/main.nf
@@ -1,6 +1,10 @@
 #!/usr/bin/env nextflow
 
-nextflow.preview.dsl = 2
+if (nextflow.version.matches('>= 20.07.1')) {
+    nextflow.enable.dsl=2
+} else {
+    nextflow.preview.dsl=2
+}
 
 VERSION = "0.0.1" // Do not edit, controlled by bumpversion.
 

diff --git a/modules/differential_expression.nf b/modules/differential_expression.nf
@@ -55,7 +55,8 @@ process run_differential_expression {
         path(anndata)
         val(cell_label_column)
         val(experiment_id)
-        val(mean_cp10k_filter)
+        val(filter_threshold)
+	val(filter_type)
         each cell_label
         each model
 
@@ -178,7 +179,8 @@ process run_differential_expression {
             --variable_target "${variable_target}" \
             --method "${model.method}" \
             --method_script $baseDir/bin/${method_script} \
-            --mean_cp10k_filter ${mean_cp10k_filter} \
+            --filter_threshold ${filter_threshold} \
+            --filter_type ${filter_type} \
             --ruvseq_n_empirical_genes ${model.ruvseq_n_empirical_genes} \
             --ruvseq_min_pvalue ${model.ruvseq_min_pvalue} \
             --ruvseq_k_factors ${model.ruvseq_k} \
@@ -347,7 +349,7 @@ process merge_de_dataframes {
         """
         echo "merge_de_dataframes: ${process_info}"
         echo "publish_directory: ${outdir}"
-        sleep 5m
+        sleep 15s
         merge_dataframes.py \
             --dataframe_keys '${result_keys}' \
             --dataframe_paths '${result_paths}' \
@@ -778,7 +780,8 @@ workflow wf__differential_expression {
             anndata,
             anndata_cell_label,
             experiment_key,
-            model.mean_cp10k_filter,
+            model.filter_threshold,
+	    model.filter_type,
             // '1',  // just run on first cluster for development
             cell_labels,  // run for all clusters for run time
             model.value