From 46cf6a7afb7c661ddecbbfb313413b6757d5d01e Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Fri, 10 Feb 2023 16:35:40 +0100
Subject: [PATCH] Allow no annotations; some input validation

---
 config/config.yml              |  8 +++--
 config/samples_annotations.tsv |  2 +-
 workflow/Snakefile             | 61 ++++++++++++++++++++++++----------
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/config/config.yml b/config/config.yml
index 595b566..ac9a195 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -18,10 +18,14 @@ fields_to_match: null
 # comparisons
 fields_to_differ:
     cell_type: hESCs
+
 # Annotations file with two columns: annotation name ("bedname") and "file" (URLs or local file)
-# Downloaded bed files will be stored in beds_folder
+# Downloaded bed files will be stored in beds_folder.
+# Leave the value empty (null) to run without any annotations.
 annotations: config/annotations.tsv
+
 # Rules about correpondence between samples and annotations can be specified here
+# Leave the value empty (null) if all annotations apply to all samples.
 samples_annotations_combinations: config/samples_annotations.tsv
 
 # folder definition is optional
@@ -113,7 +117,7 @@ pileups:
         local_rescaled: '--local --rescale --rescale_pad 1'
         by_strand_by_distance: "--by_strand --by_distance"
         by_strand_local: "--by_strand --local"
-        by_strand_distal: "--by_strand --maxdist 1000000"
+        by_strand_distal: "--by_strand --subset 10000 --maxdist 1000000"
         by_window_short_range: "--by_window --subset 1000 --maxdist 2000000"
         by_window_long_range:  "--by_window --subset 1000 --mindist 2000000"
 
diff --git a/config/samples_annotations.tsv b/config/samples_annotations.tsv
index a8d2efd..7bc5d92 100644
--- a/config/samples_annotations.tsv
+++ b/config/samples_annotations.tsv
@@ -1,2 +1,2 @@
 sample	CTCF_test_bed	ENCODE_CREs_HFF_bed
-test_cool	x	x
+test_cool_HFF	x	x
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0a7feba..6907460 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -78,9 +78,23 @@ def will_download(link):
     return False if parsed_path.scheme == "" else True
 
 
-samples_df = pd.read_csv(
-    config["samples"], sep="\t", header=0, comment="#", dtype={"do_dots": bool}
-)
+try:  # load the samples table; any failure is reported as one ValueError
+    samples_df = pd.read_csv(
+        config["samples"],
+        sep="\t",
+        header=0,
+        comment="#",
+        dtype={"do_dots": bool, "do_tads": bool},
+    )
+except Exception as err:  # not bare: KeyboardInterrupt/SystemExit must propagate
+    raise ValueError("Could not read file with samples, please ensure it exists") from err
+if "sample" not in samples_df.columns:  # column is mandatory
+    raise ValueError(
+        'Column "sample" has to be in the file with description of samples'
+    )
+if "file" not in samples_df.columns:  # read below via samples_df.file
+    raise ValueError('Column "file" has to be in the file with description of samples')
+
 samples_df = samples_df.fillna(value=False)
 samples_df.loc[:, "will_download"] = samples_df.file.apply(will_download)
 samples_df.loc[:, "local_path"] = samples_df.apply(
@@ -140,14 +154,20 @@ local_bedpe_names = {
     for bedpefile in bedpefiles_local
 }
 
-bed_df = pd.read_csv(config["annotations"], sep="\t", header=0, comment="#")
-bed_df.loc[:, "will_download"] = bed_df.file.apply(will_download)
-bed_df.loc[:, "local_path"] = bed_df.apply(
-    lambda x: make_local_path(x.bedname, x.format) if x.will_download else x.file,
-    axis=1,
-    result_type="reduce",
-)
-bed_df = bed_df.set_index("bedname").replace("-", np.nan)
+if config.get("annotations"):  # key may be empty or absent: annotations are optional
+    bed_df = pd.read_csv(config["annotations"], sep="\t", header=0, comment="#")
+    bed_df.loc[:, "will_download"] = bed_df.file.apply(will_download)
+    bed_df.loc[:, "local_path"] = bed_df.apply(
+        lambda x: make_local_path(x.bedname, x.format) if x.will_download else x.file,
+        axis=1,
+        result_type="reduce",
+    )
+    bed_df = bed_df.set_index("bedname").replace("-", np.nan)
+else:
+    bed_df = pd.DataFrame(  # empty placeholder, indexed by bedname like the branch above
+        columns=["bedname", "file", "format", "will_download", "local_path"]
+        + list(config["pileups"]["arguments"])
+    ).set_index("bedname")
 
 pileup_params = config["pileups"]["arguments"]
 
@@ -164,13 +184,18 @@ bedfiles = list(bedfiles_dict.keys())
 bedtype_dict = dict(bed_df["format"])
 # bedpe_pileups_mindist, bedpe_pileups_maxdist = config['bedpe_pileups_distance_limits']
 
-samples_annotations = ~pd.read_csv(
-    config["samples_annotations_combinations"],
-    sep="\t",
-    header=0,
-    index_col=0,
-    comment="#",
-).isna()
+if config.get("samples_annotations_combinations"):  # optional: empty or absent key
+    samples_annotations = ~pd.read_csv(  # True wherever the table has a non-empty cell
+        config["samples_annotations_combinations"],
+        sep="\t",
+        header=0,
+        index_col=0,
+        comment="#",
+    ).isna()
+else:
+    samples_annotations = pd.DataFrame(  # no table given: pair every sample with every bed
+        np.ones((len(samples), len(bedfiles)), dtype=bool), index=samples, columns=bedfiles
+    )
 
 ### Data resolutions
 if config["eigenvector"]["do"]: