remove partition info; create workflow-specific profile; bump to snakemake 8
epigen · Sep 11, 2024 · 749c9a8 · 749c9a8
1 parent c0f0124
commit 749c9a8
Show file tree

Hide file tree

Showing 9 changed files with 15 additions and 48 deletions.
diff --git a/README.md b/README.md
@@ -81,7 +81,7 @@ Interactive visualizations in self-contained HTML files of all 2D and 3D project
 **Leiden Clustering**
 We applied the Leiden algorithm (ver) [ref] to the UMAP KNN graphs specified by the respective parameters (metric, n_neighbors). The adjacency matrix of the KNN graph was converted to a weighted undirected graph using igraph (ver) [ref]. The Leiden algorithm was then applied to this graph, using the specified partition type [partition_types], resolution [resolutions], and number of iterations [n_iterations]. All clustering results were visualized as described above as 2D and interactive 2D and 3D plots for all available embeddings/projections.
 
-**Clustification Approach**
+**Clustification Approach (beta)**
 We developed/employed an iterative clustering approach, termed Clustification, that merges clusters based on misclassification. The method was initialized with the clustering result that had the highest resolution (i.e., the most clusters). We then performed iterative classification using the cluster labels, to determine if the classifier can distinguish between clusters or if they should be merged. This involved a stratified 5-fold cross-validation and a Random Forest classifier with default parameters (e.g., 100 trees). The predicted labels were retained for each iteration. Clusters were merged based on a normalized confusion matrix built using the predicted labels. This matrix was made symmetric and upper triangular, resulting in a similarity graph, such that each edge weight ranges from 0 to 1, where 0 means that the classifier was able to distinguish all observations between the two respective clusters. The stopping criterion was set such that if the maximum edge weight was less than 2.5% (i.e., 0.025 – less than 5% of observations are misclassified between any two clusters), the process would stop and return the current cluster labels. Otherwise, the two clusters connected by the maximum edge weight were merged. This process was repeated until the stopping criterion was met.
 
 **Clustree Analysis & Visualization**
@@ -136,7 +136,7 @@ The workflow performs the following analyses on each dataset provided in the ann
     - Leiden algorithm
         - Applied to the UMAP KNN graphs specified by the respective parameters (metric, n_neighbors).
         - All algorithm specific parameters are supported: [partition_types], [resolutions], and [n_iterations].
-    - Clustification: an ML-based clustering approach that iteratively merges clusters based on misclassification
+    - Clustification: an ML-based clustering approach that iteratively merges clusters based on misclassification (beta)
         0. User: Specify a clustering method [method].
         1. Choose the clustering with the most clusters as the starting point (i.e., overclustered).
         2. Iterative classification using the cluster labels, to determine if the classifier can distinguish between clusters or if they should be merged.

diff --git a/config/config.yaml b/config/config.yaml
@@ -4,7 +4,6 @@
 # memory in MB
 mem: '32000'
 threads: 2
-partition: 'shortq'
 
 ##### GENERAL #####
 annotation: test/config/digits_unsupervised_analysis_annotation.csv
@@ -29,7 +28,7 @@ umap:
     metrics: ['euclidean']
     n_neighbors: [15]
     min_dist: [0.1]
-    n_components: [2,3]
+    n_components: [2]
     densmap: 1
     connectivity: 1
     diagnostics: 1
@@ -50,7 +49,7 @@ heatmap:
 # Leiden algorithm specific parameters (partition_types, resolutions, n_iterations) -> https://leidenalg.readthedocs.io/en/stable/index.html
 # To skip Leiden clustering, leave the "metrics" parameter empty i.e., []
 leiden:
-    metrics: ['euclidean'] # has to be a subset of above's UMAP parameters
+    metrics: [] # has to be a subset of above's UMAP parameters
     n_neighbors: [15] # has to be a subset of above's UMAP parameters
     partition_types: ["RBConfigurationVertexPartition", "ModularityVertexPartition"]
     resolutions: [0.5,1,1.5,2,4] # only used for relevant partition_types
@@ -60,7 +59,7 @@ leiden:
 # ML-based clustering approach that iteratively merges clusters based on misclassification.
 # Doesn't support externally provided clusterings.
 clustification:
-    method: "Leiden" # starting clustering result method, options: "Leiden"
+    method: "" # starting clustering result method, options: "Leiden"
 
 ##### clustree #####
 # Cluster analysis and visualization using clustree: https://lazappi.github.io/clustree/index.html

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -5,7 +5,7 @@ import pandas as pd
 import yaml
 from snakemake.utils import min_version
 
-min_version("7.15.2")
+min_version("8.20.1")
 
 ##### module name #####
 module_name = "unsupervised_analysis"
@@ -169,8 +169,7 @@ rule all:
     threads: config.get("threads", 1)
     log:
         os.path.join("logs","rules","all.log"),
-    params:
-        partition=config.get("partition"),
+
 
 
 ##### load rules #####

diff --git a/workflow/profiles/default/config.yaml b/workflow/profiles/default/config.yaml
@@ -0,0 +1,3 @@
+default-resources:
+    slurm_partition: shortq
+    slurm_extra: "'--qos=shortq'"
diff --git a/workflow/rules/cluster_validation.smk b/workflow/rules/cluster_validation.smk
@@ -24,7 +24,6 @@ rule clustree_analysis:
     log:
         os.path.join("logs","rules","clustree_{sample}_{content}.log"),
     params:
-        partition=config.get("partition"),
         content = lambda w: "{}".format(w.content),
         count_filter = config["clustree"]["count_filter"],
         prop_filter = config["clustree"]["prop_filter"],
@@ -61,7 +60,6 @@ rule clustree_analysis_metadata:
     log:
         os.path.join("logs","rules","clustree_{sample}_{content}.log"),
     params:
-        partition=config.get("partition"),
         content = lambda w: "{}".format(w.content),
         count_filter = config["clustree"]["count_filter"],
         prop_filter = config["clustree"]["prop_filter"],
@@ -85,8 +83,6 @@ rule validation_external:
         "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","validation_external_{sample}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/validation_external.py"
 
@@ -104,7 +100,6 @@ rule validation_internal:
     log:
         os.path.join("logs","rules","validation_internal_{internal_index}_{sample}.log"),
     params:
-        partition=config.get("partition"),
         internal_index = lambda w: "{}".format(w.internal_index),
         sample_proportion = config["sample_proportion"],
         metadata_of_interest = config["metadata_of_interest"],
@@ -124,8 +119,6 @@ rule aggregate_rank_internal:
         "../envs/pymcdm.yaml"
     log:
         os.path.join("logs","rules","rank_internal_{sample}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/mcdm_topsis.py"
 
@@ -155,7 +148,5 @@ rule plot_indices:
         "../envs/ggplot.yaml"
     log:
         os.path.join("logs","rules","plot_{type}_indices_{sample}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/plot_indices.R"
diff --git a/workflow/rules/clustering.smk b/workflow/rules/clustering.smk
@@ -12,7 +12,6 @@ rule leiden_cluster:
     log:
         os.path.join("logs","rules","leiden_{sample}_{metric}_{n_neighbors}_{partition_type}_{resolution}_clustering.log"),
     params:
-        partition=config.get("partition"),
         samples_by_features = get_data_orientation,
         metric = lambda w: "{}".format(w.metric),
         n_neighbors = lambda w: "{}".format(w.n_neighbors),
@@ -30,13 +29,12 @@ rule clustification:
         clustering = os.path.join(config["result_path"],'unsupervised_analysis','{sample}','clustification','clustification_clusterings.csv'),
     resources:
         mem_mb=config.get("mem", "16000"),
-    threads: 8#config.get("threads", 1)
+    threads: 8 #config.get("threads", 1)
     conda:
         "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","clustification_{sample}_clusterings.log"),
     params:
-        partition=config.get("partition"),
         samples_by_features = get_data_orientation,
     script:
         "../scripts/clustification.py"
@@ -47,10 +45,10 @@ rule aggregate_clustering_results:
         get_clustering_paths,
     output:
         aggregated_clusterings = os.path.join(config["result_path"],'unsupervised_analysis','{sample}','{method}','{method}_clusterings.csv'),
+    resources:
+        mem_mb=config.get("mem", "16000"),
     log:
         os.path.join("logs","rules","aggregate_clustering_results_{sample}_{method}.log"),
-    params:
-        partition=config.get("partition"),
     run:
         # list to hold the individual clusterings
         agg_clust = []
@@ -72,10 +70,10 @@ rule aggregate_all_clustering_results:
         get_aggregated_clustering_paths,
     output:
         metadata_clusterings = os.path.join(config["result_path"],'unsupervised_analysis','{sample}','metadata_clusterings.csv'),
+    resources:
+        mem_mb=config.get("mem", "16000"),
     log:
         os.path.join("logs","rules","aggregate_all_clustering_results_{sample}.log"),
-    params:
-        partition=config.get("partition"),
     run:
         # list to hold the data
         agg_clust = []

diff --git a/workflow/rules/dimred.smk b/workflow/rules/dimred.smk
@@ -18,7 +18,6 @@ rule pca:
     log:
         os.path.join("logs","rules","PCA_{sample}_{parameters}.log"),
     params:
-        partition = config.get("partition"),
         samples_by_features = get_data_orientation,
     script:
         "../scripts/pca.py"
@@ -40,7 +39,6 @@ rule umap_graph:
     log:
         os.path.join("logs","rules","umap_{sample}_{metric}_{n_neighbors}.log"),
     params:
-        partition=config.get("partition"),
         samples_by_features = get_data_orientation,
         metric = lambda w: "{}".format(w.metric),
         n_neighbors = lambda w: "{}".format(w.n_neighbors),
@@ -64,7 +62,6 @@ rule umap_embed:
     log:
         os.path.join("logs","rules","umap_{sample}_{metric}_{n_neighbors}_{min_dist}_{n_components}.log"),
     params:
-        partition=config.get("partition"),
         samples_by_features = get_data_orientation,
         metric = lambda w: "{}".format(w.metric),
         n_neighbors = lambda w: "{}".format(w.n_neighbors),
@@ -90,7 +87,6 @@ rule densmap_embed:
     log:
         os.path.join("logs","rules","densmap_{sample}_{metric}_{n_neighbors}_{min_dist}_{n_components}.log"),
     params:
-        partition=config.get("partition"),
         samples_by_features = get_data_orientation,
         metric = lambda w: "{}".format(w.metric),
         n_neighbors = lambda w: "{}".format(w.n_neighbors),
@@ -115,7 +111,6 @@ rule distance_matrix:
     log:
         os.path.join("logs","rules","DistanceMatrix_{sample}_{metric}_{type}.log"),
     params:
-        partition = config.get("partition"),
         samples_by_features = get_data_orientation,
     script:
         "../scripts/distance_matrix.py"
diff --git a/workflow/rules/envs_export.smk b/workflow/rules/envs_export.smk
@@ -13,8 +13,6 @@ rule env_export:
     threads: config.get("threads", 1)
     log:
         os.path.join("logs","rules","env_{env}.log"),
-    params:
-        partition=config.get("partition"),
     shell:
         """
         conda env export > {output}
@@ -33,8 +31,6 @@ rule config_export:
     threads: config.get("threads", 1)
     log:
         os.path.join("logs","rules","config_export.log"),
-    params:
-        partition=config.get("partition"),
     run:
         with open(output["configs"], 'w') as outfile:
             yaml.dump(config, outfile)
@@ -54,8 +50,6 @@ rule annot_export:
     threads: config.get("threads", 1)
     log:
         os.path.join("logs","rules","annot_export.log"),
-    params:
-        partition=config.get("partition"),
     shell:
         """
         cp {input} {output}

diff --git a/workflow/rules/visualization.smk b/workflow/rules/visualization.smk
@@ -14,7 +14,6 @@ rule prep_feature_plot:
     log:
         os.path.join("logs","rules","prep_feature_plot_{sample}.log"),
     params:
-        partition = config.get("partition"),
         samples_by_features = get_data_orientation,
         features_to_plot = config["features_to_plot"],
     script:
@@ -48,7 +47,6 @@ rule plot_dimred_features:
     params:
         size = config["scatterplot2d"]["size"],
         alpha = config["scatterplot2d"]["alpha"],
-        partition=config.get("partition"),
     script:
         "../scripts/plot_2d.R"
 
@@ -82,7 +80,6 @@ rule plot_dimred_metadata:
     params:
         size = config["scatterplot2d"]["size"],
         alpha = config["scatterplot2d"]["alpha"],
-        partition=config.get("partition"),
     script:
         "../scripts/plot_2d.R"
 
@@ -144,8 +141,6 @@ rule plot_pca_diagnostics:
         "../envs/ggplot.yaml"
     log:
         os.path.join("logs","rules","plot_{method}_diagnostics_{sample}_{parameters}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/plot_pca.R"
 
@@ -173,8 +168,6 @@ rule plot_umap_diagnostics:
         "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","plot_diagnostics_{sample}_{method}_{parameters}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/plot_umap_diagnostics.py"
 
@@ -202,8 +195,6 @@ rule plot_umap_connectivity:
         "../envs/umap_leiden.yaml"
     log:
         os.path.join("logs","rules","plot_connectivity_{sample}_{method}_{parameters}.log"),
-    params:
-        partition=config.get("partition"),
     script:
         "../scripts/plot_umap_connectivity.py"
 
@@ -223,7 +214,6 @@ rule plot_dimred_interactive:
     log:
         os.path.join("logs","rules","plot_interactive_{sample}_{method}_{parameters}_{n_components}.log"),
     params:
-        partition=config.get("partition"),
         n_components = lambda w: "{}".format(w.n_components),
         size = config["scatterplot2d"]["size"],
         alpha = config["scatterplot2d"]["alpha"]
@@ -256,7 +246,6 @@ rule plot_heatmap:
     log:
         os.path.join("logs","rules","plot_heatmap_{sample}_{metric}_{method}.log"),
     params:
-        partition = config.get("partition"),
         samples_by_features = get_data_orientation,
     script:
         "../scripts/plot_heatmap.R"
@@ -290,7 +279,6 @@ rule plot_dimred_clustering:
     log:
         os.path.join("logs","rules","plot_clustering_{sample}_{method}_{parameters}_{n_components}.log"),
     params:
-        partition=config.get("partition"),
         size = config["scatterplot2d"]["size"],
         alpha = config["scatterplot2d"]["alpha"]
     script: