From 46cf6a7afb7c661ddecbbfb313413b6757d5d01e Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 10 Feb 2023 16:35:40 +0100 Subject: [PATCH] Allow no annotations; some input validation --- config/config.yml | 8 +++-- config/samples_annotations.tsv | 2 +- workflow/Snakefile | 61 ++++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/config/config.yml b/config/config.yml index 595b566..ac9a195 100644 --- a/config/config.yml +++ b/config/config.yml @@ -18,10 +18,14 @@ fields_to_match: null # comparisons fields_to_differ: cell_type: hESCs + # Annotations file with two columns: annotation name ("bedname") and "file" (URLs or local file) -# Downloaded bed files will be stored in beds_folder +# Downloaded bed files will be stored in beds_folder. +# Provide nothing here if you don't want to use any annotations. annotations: config/annotations.tsv + # Rules about correpondence between samples and annotations can be specified here +# Provide nothing here if all annotations apply to all samples. samples_annotations_combinations: config/samples_annotations.tsv # folder definition is optional @@ -113,7 +117,7 @@ pileups: local_rescaled: '--local --rescale --rescale_pad 1' by_strand_by_distance: "--by_strand --by_distance" by_strand_local: "--by_strand --local" - by_strand_distal: "--by_strand --maxdist 1000000" + by_strand_distal: "--by_strand --subset 10000 --maxdist 1000000" by_window_short_range: "--by_window --subset 1000 --maxdist 2000000" by_window_long_range: "--by_window --subset 1000 --mindist 2000000" diff --git a/config/samples_annotations.tsv b/config/samples_annotations.tsv index a8d2efd..7bc5d92 100644 --- a/config/samples_annotations.tsv +++ b/config/samples_annotations.tsv @@ -1,2 +1,2 @@ sample CTCF_test_bed ENCODE_CREs_HFF_bed -test_cool x x +test_cool_HFF x x diff --git a/workflow/Snakefile b/workflow/Snakefile index 0a7feba..6907460 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -78,9 +78,23 @@ def will_download(link): return False if parsed_path.scheme == "" else True -samples_df = pd.read_csv( - config["samples"], sep="\t", header=0, comment="#", dtype={"do_dots": bool} -) +try: + samples_df = pd.read_csv( + config["samples"], + sep="\t", + header=0, + comment="#", + dtype={"do_dots": bool, "do_tads": bool}, + ) +except: + raise ValueError("Could not read file with samples, please ensure it exists") +if "sample" not in samples_df.columns: + raise ValueError( + 'Column "sample" has to be in the file with description of samples' + ) +if "file" not in samples_df.columns: + raise ValueError('Column "file" has to be in the file with description of samples') + samples_df = samples_df.fillna(value=False) samples_df.loc[:, "will_download"] = samples_df.file.apply(will_download) samples_df.loc[:, "local_path"] = samples_df.apply( @@ -140,14 +154,20 @@ local_bedpe_names = { for bedpefile in bedpefiles_local } -bed_df = pd.read_csv(config["annotations"], sep="\t", header=0, comment="#") -bed_df.loc[:, "will_download"] = bed_df.file.apply(will_download) -bed_df.loc[:, "local_path"] = bed_df.apply( - lambda x: make_local_path(x.bedname, x.format) if x.will_download else x.file, - axis=1, - result_type="reduce", -) -bed_df = bed_df.set_index("bedname").replace("-", np.nan) +if config["annotations"]: + bed_df = pd.read_csv(config["annotations"], sep="\t", header=0, comment="#") + bed_df.loc[:, "will_download"] = bed_df.file.apply(will_download) + bed_df.loc[:, "local_path"] = bed_df.apply( + lambda x: make_local_path(x.bedname, x.format) if x.will_download else x.file, + axis=1, + result_type="reduce", + ) + bed_df = bed_df.set_index("bedname").replace("-", np.nan) +else: + bed_df = pd.DataFrame( + columns=["bedname", "file", "format", "will_download", "local_path"] + + list(config["pileups"]["arguments"]) + ) pileup_params = config["pileups"]["arguments"] @@ -164,13 +184,18 @@ bedfiles = list(bedfiles_dict.keys()) bedtype_dict = dict(bed_df["format"]) # bedpe_pileups_mindist, bedpe_pileups_maxdist = config['bedpe_pileups_distance_limits'] -samples_annotations = ~pd.read_csv( - config["samples_annotations_combinations"], - sep="\t", - header=0, - index_col=0, - comment="#", -).isna() +if config["samples_annotations_combinations"]: + samples_annotations = ~pd.read_csv( + config["samples_annotations_combinations"], + sep="\t", + header=0, + index_col=0, + comment="#", + ).isna() +else: + samples_annotations = pd.DataFrame( + np.ones((len(samples), len(bedfiles))), index=samples, columns=bedfiles + ).astype(bool) ### Data resolutions if config["eigenvector"]["do"]: