From 05480c2a15a9928f1a13a4e7469332aab6cf8d12 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Mon, 10 Jul 2023 16:56:56 -0400 Subject: [PATCH 1/4] Allow unauthenticated s3 --- bin/utils/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/utils/common.py b/bin/utils/common.py index 538861e..0f9e093 100755 --- a/bin/utils/common.py +++ b/bin/utils/common.py @@ -6,6 +6,8 @@ import pandas as pd import bbi import boto3 +from botocore import UNSIGNED +from botocore.config import Config import os @@ -13,7 +15,8 @@ def s3_to_local(s3_path, local_path): """ Convert s3 paths to local file paths for pybbi to process. """ - s3 = boto3.resource('s3') + # s3 = boto3.resource('s3') + s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) path_parts = s3_path.replace("s3://","").split("/") bucket=path_parts.pop(0) @@ -23,7 +26,8 @@ def s3_to_local(s3_path, local_path): os.makedirs(local_path) filename = key.rsplit('/', 1)[-1] - s3.Object(bucket, key).download_file(local_path + filename) + # s3.Object(bucket, key).download_file(local_path + filename) + s3.download_file(bucket, key, local_path + filename) local_file = f"{local_path}{filename}" return local_file From 75dc9c4a8ec4e6dc317055388a928c9484596d72 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Mon, 10 Jul 2023 19:47:33 -0400 Subject: [PATCH 2/4] Fix: deprecated np.float -> np.float64 --- bin/utils/eigdecomp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/utils/eigdecomp.py b/bin/utils/eigdecomp.py index 50fe5e4..dfc2fc0 100644 --- a/bin/utils/eigdecomp.py +++ b/bin/utils/eigdecomp.py @@ -237,7 +237,7 @@ def eig_trans( # We actually extract n + 1 eigenvectors. # The first eigenvector, E0, will be uniform with eigenvalue 1. mask = np.sum(np.abs(A), axis=0) != 0 - A_collapsed = A[mask, :][:, mask].astype(np.float, copy=True) + A_collapsed = A[mask, :][:, mask].astype(np.float64, copy=True) eigvals, eigvecs_collapsed = scipy.sparse.linalg.eigsh( A_collapsed, n_eigs + 1, @@ -479,7 +479,7 @@ def _each(region): if (A.shape[0] <= ignore_diags + n_eigs + 1) or (mask.sum() <= ignore_diags + n_eigs + 1): return _region, np.nan * np.ones(n_eigs+1), np.nan * np.ones((len(A), n_eigs+1)) - A_collapsed = A[mask, :][:, mask].astype(np.float, copy=True) + A_collapsed = A[mask, :][:, mask].astype(np.float64, copy=True) eigvals, eigvecs_collapsed = scipy.sparse.linalg.eigsh( A_collapsed, n_eigs + 1, From 272a97448f4aba475276db80ebc9b3918b92899b Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Mon, 10 Jul 2023 19:53:22 -0400 Subject: [PATCH 3/4] add pre-commit config --- .pre-commit-config.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3d5cdf7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,32 @@ +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: debug-statements + - id: name-tests-test + - id: requirements-txt-fixer +- repo: https://github.com/asottile/setup-cfg-fmt + rev: v2.4.0 + hooks: + - id: setup-cfg-fmt +- repo: https://github.com/PyCQA/flake8 + rev: 6.0.0 + hooks: + - id: flake8 +- repo: https://github.com/PyCQA/isort + rev: 5.12.0 + hooks: + - id: isort +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.4.1 + hooks: + - id: mypy + additional_dependencies: [types-all] + exclude: ^testing/resources/ +- repo: https://github.com/psf/black + rev: 23.3.0 + hooks: + - id: black From e18891c03ae97a3092d441f9ea1c7f3c9b4518a9 Mon Sep 17 00:00:00 2001 From: Thomas Reimonn Date: Mon, 10 Jul 2023 20:06:00 -0400 Subject: [PATCH 4/4] autoflake --- bin/clustering_main.py | 46 +++---- bin/eigdecomp_main.py | 59 ++++----- bin/heatmap.py | 70 +++++------ bin/make_bintable.py | 65 ++++------ bin/make_multivec.py | 30 ++--- bin/make_track_db.py | 91 ++++++-------- bin/plot_spectrum.py | 27 ++-- bin/scatters.py | 50 ++++---- bin/utils/clustering.py | 257 +++++++++++++++++++++++++++------------ bin/utils/common.py | 57 ++++----- bin/utils/df2multivec.py | 41 ++----- bin/utils/eigdecomp.py | 127 +++++++++---------- bin/utils/plotting.py | 209 +++++++++++++------------------ 13 files changed, 543 insertions(+), 586 deletions(-) diff --git a/bin/clustering_main.py b/bin/clustering_main.py index 86ec8f0..b7f7e37 100755 --- a/bin/clustering_main.py +++ b/bin/clustering_main.py @@ -1,36 +1,28 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt +import matplotlib as mpl -from functools import partial -import os.path as op -import pathlib +mpl.use("Agg") +import argparse -from tqdm import tqdm import bioframe -import cooler -import h5py import numpy as np import pandas as pd -import argparse -import yaml - import utils.clustering as cluster +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True) -parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True) -parser.add_argument('--bins', help='parquet bins file', required=True) +parser.add_argument("--config") +parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True) +parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True) +parser.add_argument("--bins", help="parquet bins file", required=True) args = parser.parse_args() with open(args.config, "r") as infile: config = yaml.full_load(infile) - + assembly = config["assembly"] n_clusters = config["n_clusters"] binsize = config["binsize"] @@ -38,8 +30,8 @@ n_eigs = config["n_eigs"] CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) @@ -54,21 +46,20 @@ cluster_sort_key = "GC" eigvecs = pd.read_parquet(args.eigvecs) -eigvals = pd.read_parquet(args.eigvals).set_index('eig') -eigvecs = eigvecs[eigvecs['chrom'].isin(chromosomes)] +eigvals = pd.read_parquet(args.eigvals).set_index("eig") +eigvecs = eigvecs[eigvecs["chrom"].isin(chromosomes)] # Use as many eigenvectors as initial positive eigenvalues n_components = np.where(eigvals < 0)[0][0] - 1 print(f"Using {n_components} components for clustering...") sorting_tracks = pd.read_parquet(args.bins) -sorting_tracks = sorting_tracks[sorting_tracks['chrom'].isin(chromosomes)] +sorting_tracks = sorting_tracks[sorting_tracks["chrom"].isin(chromosomes)] -out = eigvecs[['chrom', 'start', 'end']].copy() +out = eigvecs[["chrom", "start", "end"]].copy() for n_cluster in n_clusters: - - colname = f'kmeans_sm{n_cluster}' + colname = f"kmeans_sm{n_cluster}" labels = cluster.kmeans_sm( eigvals, @@ -85,7 +76,6 @@ ) out[colname] = new_labels - out[colname + '_order'] = bin_ranks - -out.to_csv(f"{sample}.{binsize}.E1-E{n_eigs}.kmeans_sm.tsv", sep='\t', index=False) + out[colname + "_order"] = bin_ranks +out.to_csv(f"{sample}.{binsize}.E1-E{n_eigs}.kmeans_sm.tsv", sep="\t", index=False) diff --git a/bin/eigdecomp_main.py b/bin/eigdecomp_main.py index 5951d22..29a16e2 100755 --- a/bin/eigdecomp_main.py +++ b/bin/eigdecomp_main.py @@ -1,36 +1,28 @@ #!/usr/bin/env python3 - -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt -from functools import partial -import os.path as op -import pathlib +import matplotlib as mpl + +mpl.use("Agg") +import argparse -from tqdm import tqdm import bioframe import cooler -import h5py import numpy as np import pandas as pd -import argparse -import yaml - -import utils.common as common import utils.eigdecomp as eig +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--bins', help='parquet bins file', required=True) -parser.add_argument('--blacklist', help='blacklist file path') -parser.add_argument('--cooler', help='cooler file path', required=True) +parser.add_argument("--config") +parser.add_argument("--bins", help="parquet bins file", required=True) +parser.add_argument("--blacklist", help="blacklist file path") +parser.add_argument("--cooler", help="cooler file path", required=True) args = parser.parse_args() with open(args.config, "r") as infile: config = yaml.full_load(infile) - + assembly = config["assembly"] binsize = config["binsize"] sample = config["sample"] @@ -38,8 +30,8 @@ decomp_mode = config["decomp_mode"] CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) @@ -50,29 +42,23 @@ # has a header (chrom, start, end, GC) ref_track = pd.read_parquet(args.bins) -ref_track = ref_track[ref_track['chrom'].isin(chromosomes)] +ref_track = ref_track[ref_track["chrom"].isin(chromosomes)] # include blacklist if args.blacklist is not None: # no header - blacklist = pd.read_csv( - args.blacklist, - sep='\t', - names=['chrom', 'start', 'end'] + blacklist = pd.read_csv(args.blacklist, sep="\t", names=["chrom", "start", "end"]) + ref_track = bioframe.count_overlaps(ref_track, blacklist).rename( + columns={"count": "is_bad"} ) - ref_track = ( - bioframe.count_overlaps(ref_track, blacklist) - .rename(columns={'count': 'is_bad'}) - ) -ref_track = ref_track[ref_track['chrom'].isin(chromosomes)] +ref_track = ref_track[ref_track["chrom"].isin(chromosomes)] path = args.cooler clr = cooler.Cooler(f"{path}::resolutions/{binsize}") -if decomp_mode=="trans": +if decomp_mode == "trans": partition = np.r_[ - [clr.offset(chrom) for chrom in chromosomes], - clr.extent(chromosomes[-1])[1] + [clr.offset(chrom) for chrom in chromosomes], clr.extent(chromosomes[-1])[1] ] eigval_df, eigvec_df = eig.eig_trans( @@ -84,7 +70,7 @@ corr_metric=None, ) -elif decomp_mode=="cis": +elif decomp_mode == "cis": viewframe_path = (args.assembly).get("viewframe_cis", None) if viewframe_path is None: CHROMARMS = bioframe.make_chromarms(CHROMSIZES, CENTROMERES) @@ -98,8 +84,8 @@ phasing_track_col="GC", n_eigs=n_eigs, corr_metric=None, - ignore_diags=None, # will be inferred from cooler - view_df=viewframe + ignore_diags=None, # will be inferred from cooler + view_df=viewframe, ) else: raise ValueError(f"Mode {decomp_mode} is not implemented") @@ -107,4 +93,3 @@ # Output eigval_df.to_parquet(f"{sample}.{binsize}.E0-E{n_eigs}.{decomp_mode}.eigvals.pq") eigvec_df.to_parquet(f"{sample}.{binsize}.E0-E{n_eigs}.{decomp_mode}.eigvecs.pq") - diff --git a/bin/heatmap.py b/bin/heatmap.py index 25396ab..9dda91f 100755 --- a/bin/heatmap.py +++ b/bin/heatmap.py @@ -1,32 +1,27 @@ #!/usr/bin/env python3 - -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt -from functools import partial +import matplotlib as mpl + +mpl.use("Agg") +import argparse import os.path as op -import pathlib -from tqdm import tqdm import bioframe -import cooler import h5py +import matplotlib.pyplot as plt import numpy as np import pandas as pd -import argparse -import yaml - import utils.plotting as plotting +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True) -parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True) -parser.add_argument('--bins', help='parquet bins file', required=True) -parser.add_argument('--cluster', help='tsv file with k-means clusters', required=True) -parser.add_argument('--track_db', help='track_db file', required=True) -parser.add_argument('--meta', help='bigwig metadata file', required=True) +parser.add_argument("--config") +parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True) +parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True) +parser.add_argument("--bins", help="parquet bins file", required=True) +parser.add_argument("--cluster", help="tsv file with k-means clusters", required=True) +parser.add_argument("--track_db", help="track_db file", required=True) +parser.add_argument("--meta", help="bigwig metadata file", required=True) args = parser.parse_args() @@ -39,8 +34,8 @@ n_eigs = config["n_eigs"] CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) @@ -48,47 +43,48 @@ CENTROMERES = None chromosomes = CHROMOSOMES_FOR_CLUSTERING -sort_by = 'centel' -norm = 'sqrt' +sort_by = "centel" +norm = "sqrt" n_eigs_heatmap = 10 n_clusters = config["n_clusters"] eigvecs = pd.read_parquet(args.eigvecs) -eigvals = pd.read_parquet(args.eigvals).set_index('eig')['val'] -sqrt_lam = np.sqrt(np.abs(eigvals.loc['E1':f'E{n_eigs_heatmap}'].to_numpy())) -if norm == 'sqrt': - eigvecs.loc[:, 'E1':f'E{n_eigs_heatmap}'] *= sqrt_lam[np.newaxis, :] -eigvecs = eigvecs[eigvecs['chrom'].isin(chromosomes)].copy() +eigvals = pd.read_parquet(args.eigvals).set_index("eig")["val"] +sqrt_lam = np.sqrt(np.abs(eigvals.loc["E1":f"E{n_eigs_heatmap}"].to_numpy())) +if norm == "sqrt": + eigvecs.loc[:, "E1":f"E{n_eigs_heatmap}"] *= sqrt_lam[np.newaxis, :] +eigvecs = eigvecs[eigvecs["chrom"].isin(chromosomes)].copy() bins = pd.read_parquet(args.bins) clusters = pd.read_table(args.cluster) for clus in n_clusters: - bins["cluster"] = clusters[f'kmeans_sm{clus}'] + bins["cluster"] = clusters[f"kmeans_sm{clus}"] track_db_path = args.track_db if op.exists(track_db_path): meta = pd.read_table(args.meta).set_index("Name") - with h5py.File(track_db_path, 'r') as db: + with h5py.File(track_db_path, "r") as db: for group in config["scatter_groups"].values(): for track_name in group: if track_name not in bins.columns: uid = meta["ID"].get(track_name, track_name) bins[track_name] = db[uid][:] - bins = bins[bins['chrom'].isin(chromosomes)].copy() + bins = bins[bins["chrom"].isin(chromosomes)].copy() - if sort_by == 'centel': - idx = np.lexsort([ - bins['centel_abs'].values, bins['cluster'].values - ]) + if sort_by == "centel": + idx = np.lexsort([bins["centel_abs"].values, bins["cluster"].values]) else: raise ValueError(sort_by) plotting.plot_heatmap( idx, - eigvecs.loc[:, 'E1':f'E{n_eigs_heatmap}'], + eigvecs.loc[:, "E1":f"E{n_eigs_heatmap}"], bins, - trackconfs= config["tracks"], + trackconfs=config["tracks"], blocks=config["heatmap_groups"], coarse_factor=32, ) - plt.savefig(f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.heatmap.pdf", bbox_inches='tight') + plt.savefig( + f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.heatmap.pdf", + bbox_inches="tight", + ) diff --git a/bin/make_bintable.py b/bin/make_bintable.py index b4a4e74..a6123f5 100755 --- a/bin/make_bintable.py +++ b/bin/make_bintable.py @@ -1,27 +1,20 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt +import matplotlib as mpl +mpl.use("Agg") +import argparse from functools import partial -import os.path as op -import pathlib -from tqdm import tqdm import bioframe -import cooler -import h5py import numpy as np import pandas as pd -import argparse -import yaml - import utils.common as common +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--genome', help='path to fasta file of assembly', required=True) +parser.add_argument("--config") +parser.add_argument("--genome", help="path to fasta file of assembly", required=True) args = parser.parse_args() @@ -32,8 +25,8 @@ binsize = config["binsize"] CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) @@ -42,40 +35,34 @@ if CENTROMERES is None or len(CENTROMERES) == 0: mids = {chrom: 0 for chrom in CHROMOSOMES} - arms = pd.DataFrame({ - "chrom": CHROMSIZES.index, - "start": 0, - "end": CHROMSIZES.values, - "name": CHROMSIZES.index, - }) - + arms = pd.DataFrame( + { + "chrom": CHROMSIZES.index, + "start": 0, + "end": CHROMSIZES.values, + "name": CHROMSIZES.index, + } + ) + else: - mids = CENTROMERES.set_index('chrom')['mid'] + mids = CENTROMERES.set_index("chrom")["mid"] arms = common.make_chromarms(CHROMSIZES, mids, binsize) -arms.to_csv( - f"{assembly}.chromarms.{binsize}.bed", - sep='\t', - index=False, - header=False -) +arms.to_csv(f"{assembly}.chromarms.{binsize}.bed", sep="\t", index=False, header=False) fa_records = bioframe.load_fasta(args.genome) df = bioframe.binnify(CHROMSIZES, binsize) df = bioframe.frac_gc(df, fa_records) df = common.assign_arms(df, arms) armlens = ( - arms - .assign(length=arms['end'] - arms['start']) - .set_index('name')['length'] + arms.assign(length=arms["end"] - arms["start"]) + .set_index("name")["length"] .to_dict() ) -df['armlen'] = df['arm'].apply(armlens.get) -df['centel'] = ( - df - .groupby('arm', sort=False) - .apply(partial(common.assign_centel, arms=arms.set_index('name'))) +df["armlen"] = df["arm"].apply(armlens.get) +df["centel"] = ( + df.groupby("arm", sort=False) + .apply(partial(common.assign_centel, arms=arms.set_index("name"))) .reset_index(drop=True) ) -df['centel_abs'] = np.round(df['centel'] * df['armlen']).astype(int) +df["centel_abs"] = np.round(df["centel"] * df["armlen"]).astype(int) df.to_parquet(f"{assembly}.bins.gc.{binsize}.pq") - diff --git a/bin/make_multivec.py b/bin/make_multivec.py index b07c460..66b75ab 100755 --- a/bin/make_multivec.py +++ b/bin/make_multivec.py @@ -1,34 +1,26 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt +import matplotlib as mpl -from functools import partial -import os.path as op -import pathlib +mpl.use("Agg") +import argparse -from tqdm import tqdm import bioframe -import cooler -import h5py import numpy as np import pandas as pd -import argparse -import yaml - import utils.df2multivec as mv +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True) -parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True) +parser.add_argument("--config") +parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True) +parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True) args = parser.parse_args() with open(args.config, "r") as infile: config = yaml.full_load(infile) - + assembly = config["assembly"] binsize = config["binsize"] sample = config["sample"] @@ -38,16 +30,16 @@ CHROMSIZES = bioframe.fetch_chromsizes(assembly) -eigvals = pd.read_parquet(args.eigvals).set_index('eig')['val'] +eigvals = pd.read_parquet(args.eigvals).set_index("eig")["val"] eigvecs = pd.read_parquet(args.eigvecs) sqrt_lam = np.sqrt(np.abs(eigvals.to_numpy())) -eigvecs.loc[:, 'E0':] = (eigvecs.loc[:, 'E0':] * sqrt_lam[np.newaxis, :]) +eigvecs.loc[:, "E0":] = eigvecs.loc[:, "E0":] * sqrt_lam[np.newaxis, :] mv.to_multivec( multivec, eigvecs, - [f'E{i}' for i in range(1, n_eigs_multivec)], + [f"E{i}" for i in range(1, n_eigs_multivec)], base_res=binsize, chromsizes=CHROMSIZES, ) diff --git a/bin/make_track_db.py b/bin/make_track_db.py index a2e6b20..e096106 100755 --- a/bin/make_track_db.py +++ b/bin/make_track_db.py @@ -1,106 +1,85 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt +import matplotlib as mpl -from functools import partial +mpl.use("Agg") +import argparse import os.path as op -import pathlib -from tqdm import tqdm -from loky import get_reusable_executor import bioframe -import cooler import h5py -import numpy as np import pandas as pd -import argparse -import yaml - import utils.common as common +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--bins', help='parquet bins file', required=True) -parser.add_argument('--meta', help='metadata file', required=True) +parser.add_argument("--config") +parser.add_argument("--bins", help="parquet bins file", required=True) +parser.add_argument("--meta", help="metadata file", required=True) args = parser.parse_args() with open(args.config, "r") as infile: config = yaml.full_load(infile) - + assembly = config["assembly"] binsize = config["binsize"] track_db = f"tracks.{assembly}.{binsize}.h5" local_path = "local_track_files/" CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) except ValueError: CENTROMERES = None -h5opts = dict(compression='gzip', compression_opts=6) +h5opts = dict(compression="gzip", compression_opts=6) bins = pd.read_parquet(args.bins) if not op.exists(track_db): - with h5py.File(track_db, 'w') as f: - for col in [ - 'chrom', - 'start', - 'end', - 'GC', - 'armlen', - 'centel', - 'centel_abs' - ]: + with h5py.File(track_db, "w") as f: + for col in ["chrom", "start", "end", "GC", "armlen", "centel", "centel_abs"]: f.create_dataset(col, data=bins[col].values, **h5opts) meta = pd.read_table(args.meta) -paths = meta.set_index('ID')['Path'] -with h5py.File(track_db, 'a') as f: +paths = meta.set_index("ID")["Path"] +with h5py.File(track_db, "a") as f: for ix, row in meta.iterrows(): - if row['ID'] in f: + if row["ID"] in f: continue - if row['FileFormat'].lower() == 'bigwig': - acc = row['ID'] + if row["FileFormat"].lower() == "bigwig": + acc = row["ID"] x = common.fetch_binned( - paths[acc], - CHROMSIZES, - local_path, - CHROMOSOMES, - binsize, - map - ) + paths[acc], CHROMSIZES, local_path, CHROMOSOMES, binsize, map + ) f.create_dataset(acc, data=x, **h5opts) - elif row['FileFormat'].lower() == 'bedgraph': - acc = row['ID'] - df = bioframe.read_table(paths[acc], schema='bedGraph') + elif row["FileFormat"].lower() == "bedgraph": + acc = row["ID"] + df = bioframe.read_table(paths[acc], schema="bedGraph") ov = bioframe.overlap( bins, df, - how='left', + how="left", return_overlap=True, keep_order=True, - suffixes=('', '_') + suffixes=("", "_"), ) - ov['overlap'] = ov['overlap_end'] - ov['overlap_start'] - ov['score'] = ov['value_'] * ov['overlap'] - out = ov.groupby(['chrom', 'start', 'end'], sort=False).agg(**{ - 'score': ('score', 'sum') - }).reset_index() - out['score'] /= (out['end'] - out['start']) - x = out['score'].values + ov["overlap"] = ov["overlap_end"] - ov["overlap_start"] + ov["score"] = ov["value_"] * ov["overlap"] + out = ( + ov.groupby(["chrom", "start", "end"], sort=False) + .agg(**{"score": ("score", "sum")}) + .reset_index() + ) + out["score"] /= out["end"] - out["start"] + x = out["score"].values f.create_dataset(acc, data=x, **h5opts) else: - raise ValueError(row['FileFormat']) - - + raise ValueError(row["FileFormat"]) diff --git a/bin/plot_spectrum.py b/bin/plot_spectrum.py index a5b7f6b..609334f 100755 --- a/bin/plot_spectrum.py +++ b/bin/plot_spectrum.py @@ -1,27 +1,18 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt - -from functools import partial -import os.path as op -import pathlib +import matplotlib as mpl -from tqdm import tqdm -import bioframe -import cooler -import h5py -import numpy as np -import pandas as pd +mpl.use("Agg") import argparse -import yaml +import matplotlib.pyplot as plt +import pandas as pd import utils.plotting as plotting +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True) +parser.add_argument("--config") +parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True) args = parser.parse_args() @@ -39,8 +30,8 @@ plotting.plot_spectrum( eigval_df, n_eigs_display=min(32, n_eigs), - title= sample + "." + str(binsize), - outpath=eig_pdf + title=sample + "." + str(binsize), + outpath=eig_pdf, ) plt.savefig(eig_pdf) diff --git a/bin/scatters.py b/bin/scatters.py index 52824f5..637d692 100755 --- a/bin/scatters.py +++ b/bin/scatters.py @@ -1,33 +1,26 @@ #!/usr/bin/env python3 -import matplotlib as mpl -mpl.use("Agg") -import matplotlib.pyplot as plt +import matplotlib as mpl -from functools import partial +mpl.use("Agg") +import argparse import os.path as op -import pathlib -from tqdm import tqdm import bioframe -import cooler import h5py -import numpy as np +import matplotlib.pyplot as plt import pandas as pd -import argparse -import yaml - import utils.plotting as plotting -import utils.clustering as cluster +import yaml parser = argparse.ArgumentParser() -parser.add_argument('--config') -parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True) -parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True) -parser.add_argument('--bins', help='parquet bins file', required=True) -parser.add_argument('--clusters', help='cluster name', required=True) -parser.add_argument('--track_db', help='track_db file', required=True) -parser.add_argument('--meta', help='bigwig metadata file', required=True) +parser.add_argument("--config") +parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True) +parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True) +parser.add_argument("--bins", help="parquet bins file", required=True) +parser.add_argument("--clusters", help="cluster name", required=True) +parser.add_argument("--track_db", help="track_db file", required=True) +parser.add_argument("--meta", help="bigwig metadata file", required=True) args = parser.parse_args() @@ -40,8 +33,8 @@ n_eigs = config["n_eigs"] CHROMSIZES = bioframe.fetch_chromsizes(assembly) -CHROMOSOMES = list(CHROMSIZES[:'chrY'].index) -CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index) +CHROMOSOMES = list(CHROMSIZES[:"chrY"].index) +CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index) try: CENTROMERES = bioframe.fetch_centromeres(assembly) @@ -52,8 +45,8 @@ chromosomes = CHROMOSOMES_FOR_CLUSTERING eigvecs = pd.read_parquet(args.eigvecs) -eigvecs = eigvecs[eigvecs['chrom'].isin(chromosomes)].copy() -eigvals = pd.read_parquet(args.eigvals).set_index('eig')['val'] +eigvecs = eigvecs[eigvecs["chrom"].isin(chromosomes)].copy() +eigvals = pd.read_parquet(args.eigvals).set_index("eig")["val"] # sqrt_lam = np.sqrt(np.abs(eigvals.to_numpy())) # eigvecs.loc[:, 'E0':] = ( # eigvecs.loc[:, 'E0':] * sqrt_lam[np.newaxis, :] @@ -63,17 +56,17 @@ clusters = pd.read_table(args.clusters) for clus in n_clusters: - bins["cluster"] = clusters[f'kmeans_sm{clus}'] + bins["cluster"] = clusters[f"kmeans_sm{clus}"] track_db_path = args.track_db if op.exists(track_db_path): meta = pd.read_table(args.meta).set_index("Name") - with h5py.File(track_db_path, 'r') as db: + with h5py.File(track_db_path, "r") as db: for group in config["scatter_groups"].values(): for track_name in group: if track_name not in bins.columns: uid = meta["ID"].get(track_name, track_name) bins[track_name] = db[uid][:] - bins = bins[bins['chrom'].isin(chromosomes)].copy() + bins = bins[bins["chrom"].isin(chromosomes)].copy() plotting.plot_scatters( eigvecs, @@ -81,4 +74,7 @@ trackconfs=config["tracks"], panels=config["scatter_groups"], ) - plt.savefig(f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.scatter.pdf", bbox_inches='tight') + plt.savefig( + f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.scatter.pdf", + bbox_inches="tight", + ) diff --git a/bin/utils/clustering.py b/bin/utils/clustering.py index 1dea4ea..efc399c 100644 --- a/bin/utils/clustering.py +++ b/bin/utils/clustering.py @@ -1,26 +1,26 @@ #!/usr/bin/env python3 import warnings -import bioframe + import numpy as np import pandas as pd def _extract_eigs( - eigvals, - eigvecs, - n_clusters, - n_components, - weight_by_eigval, - keep_first, - positive_eigs=False, - filter_nans=True + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs=False, + filter_nans=True, ): eigvecs = eigvecs.copy() # Decide whether to use unit normed vectors or to weight them by sqrt(|lambda_i|) if weight_by_eigval: - eigvecs.loc[:, 'E0':] *= np.sqrt(np.abs(eigvals.T.values)) + eigvecs.loc[:, "E0":] *= np.sqrt(np.abs(eigvals.T.values)) # Do the k-clustering on top k eigenvectors, unless overriden to use more or fewer if n_components is None: @@ -29,14 +29,14 @@ def _extract_eigs( if not positive_eigs: # Decide whether to use E0 or not if keep_first: - elo, ehi = 'E0', f'E{n_components - 1}' + elo, ehi = "E0", f"E{n_components - 1}" else: - elo, ehi = 'E1', f'E{n_components}' + elo, ehi = "E1", f"E{n_components}" X = eigvecs.loc[:, elo:ehi].values else: if not keep_first: - eigvals = eigvals.drop('E0') - which = eigvals.loc[eigvals['val'] > 0].index[:n_components] + eigvals = eigvals.drop("E0") + which = eigvals.loc[eigvals["val"] > 0].index[:n_components] X = eigvecs.loc[:, which].values if not filter_nans: @@ -57,45 +57,41 @@ def relabel_clusters(labels, n_clusters, sorting_tracks, sort_key): 3. Length of corresponding chromosome arm. """ # Assign the cluster IDs and extra data to temporary dataframe - df = sorting_tracks[['chrom', 'start', 'end', sort_key, 'centel', 'armlen']].copy() - df['centel_abs'] = df['centel'] * df['armlen'] - df['cluster'] = labels + df = sorting_tracks[["chrom", "start", "end", sort_key, "centel", "armlen"]].copy() + df["centel_abs"] = df["centel"] * df["armlen"] + df["cluster"] = labels # Relabel the clusters using median of sorting column - df.loc[df['cluster'] == n_clusters, sort_key] = np.inf + df.loc[df["cluster"] == n_clusters, sort_key] = np.inf clusters_ordered = ( - df - .groupby('cluster') - [sort_key] - .median() - .sort_values() - .index - .tolist() + df.groupby("cluster")[sort_key].median().sort_values().index.tolist() ) cluster_dtype = pd.CategoricalDtype(clusters_ordered, ordered=True) - df['cluster_relabeled'] = df['cluster'].astype(cluster_dtype).cat.codes + df["cluster_relabeled"] = df["cluster"].astype(cluster_dtype).cat.codes # Reorder the bins for plotting - bin_ranks = ( - df - .sort_values( - ['cluster_relabeled', 'centel_abs'], - ascending=[True, True] - ) - .index - .values - ) - return df['cluster_relabeled'].values, bin_ranks - - -def kmeans_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): + bin_ranks = df.sort_values( + ["cluster_relabeled", "centel_abs"], ascending=[True, True] + ).index.values + return df["cluster_relabeled"].values, bin_ranks + + +def kmeans_sm( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Shi and Malik (2000) * Use the eigenvectors of L_rw (this doesn't matter for us). * NO row normalization before clustering. Notes ----- - This is what sklearn spectral_clustering does on the eigenvectors of the + This is what sklearn spectral_clustering does on the eigenvectors of the normalized laplacian (when using the k-means method). Sklearn's implementation does not unit norm the input vectors as some do. """ @@ -103,7 +99,7 @@ def kmeans_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval= model = KMeans( n_clusters=n_clusters, - init='k-means++', + init="k-means++", n_init=100, max_iter=10000, tol=0.00001, @@ -116,7 +112,13 @@ def kmeans_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval= ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) labels = np.full(len(mask), n_clusters) labels[mask] = model.fit_predict(x) @@ -124,7 +126,15 @@ def kmeans_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval= return labels -def gaussian_mixture_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def gaussian_mixture_sm( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Extends the Shi and Malik method to a GMM with no covariance constraints. * Use the eigenvectors of L_rw (this doesn't matter for us). @@ -136,18 +146,24 @@ def gaussian_mixture_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_ model = GaussianMixture( n_components=n_clusters, - covariance_type='full', + covariance_type="full", tol=0.001, reg_covar=1e-06, n_init=100, max_iter=1000, - init_params='kmeans', + init_params="kmeans", random_state=42, - verbose=0 + verbose=0, ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) labels = np.full(len(mask), n_clusters) labels[mask] = model.fit_predict(x) @@ -158,7 +174,15 @@ def gaussian_mixture_sm(eigvals, eigvecs, n_clusters, n_components=None, weight_ return labels -def kmeans_njw(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def kmeans_njw( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Ng, Jordan and Weiss (2002) * Use the eigenvectors of L_sym (this doesn't matter for us). @@ -168,18 +192,24 @@ def kmeans_njw(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval model = KMeans( n_clusters=n_clusters, - init='k-means++', + init="k-means++", n_init=100, max_iter=10000, tol=0.00001, verbose=0, random_state=42, n_jobs=32, - algorithm='auto', + algorithm="auto", ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) # Normalize rows to norm 1 @@ -191,7 +221,15 @@ def kmeans_njw(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval return labels -def gaussian_mixture_njw(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def gaussian_mixture_njw( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Extends the Shi and Malik method to a GMM with no covariance constraints. * Use the eigenvectors of L_rw (this doesn't matter for us). @@ -203,18 +241,24 @@ def gaussian_mixture_njw(eigvals, eigvecs, n_clusters, n_components=None, weight model = GaussianMixture( n_components=n_clusters, - covariance_type='full', + covariance_type="full", tol=0.001, reg_covar=1e-06, n_init=100, max_iter=1000, - init_params='kmeans', + init_params="kmeans", random_state=42, - verbose=0 + verbose=0, ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) # Normalize rows to norm 1 @@ -229,7 +273,15 @@ def gaussian_mixture_njw(eigvals, eigvecs, n_clusters, n_components=None, weight return labels -def discretize_ys(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def discretize_ys( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Yu and Shi (2003) * Use the unit-norm eigenvectors of L_rw (this doesn't matter for us). @@ -249,7 +301,13 @@ def discretize_ys(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eig from sklearn.cluster._spectral import discretize x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) labels = np.full(len(mask), n_clusters) @@ -259,7 +317,15 @@ def discretize_ys(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eig return labels -def spherical_kmeans(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def spherical_kmeans( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Project points onto the unit hypersphere and cluster. * Use the unit-norm eigenvectors of L_rw or L_sym (doesn't matter for us). @@ -276,7 +342,7 @@ def spherical_kmeans(eigvals, eigvecs, n_clusters, n_components=None, weight_by_ model = SphericalKMeans( n_clusters=n_clusters, - init='k-means++', + init="k-means++", n_init=100, max_iter=10000, tol=0.00001, @@ -286,7 +352,13 @@ def spherical_kmeans(eigvals, eigvecs, n_clusters, n_components=None, weight_by_ ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) # Normalize rows to norm 1 @@ -298,7 +370,15 @@ def spherical_kmeans(eigvals, eigvecs, n_clusters, n_components=None, weight_by_ return labels -def vonmises_mixture(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def vonmises_mixture( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ Extend the spherical kmeans method to a von Mises-Fisher (gaussian on a sphere) MM. * Use the unit-norm eigenvectors of L_rw or L_sym (doesn't matter for us). @@ -325,7 +405,13 @@ def vonmises_mixture(eigvals, eigvecs, n_clusters, n_components=None, weight_by_ ) x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) # Normalize rows to norm 1 @@ -338,30 +424,44 @@ def vonmises_mixture(eigvals, eigvecs, n_clusters, n_components=None, weight_by_ return labels -def gaussian_hmm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigval=False, keep_first=True, positive_eigs=False): +def gaussian_hmm( + eigvals, + eigvecs, + n_clusters, + n_components=None, + weight_by_eigval=False, + keep_first=True, + positive_eigs=False, +): """ - Model the rows (points) as an HMM on k different latent k-dimensional + Model the rows (points) as an HMM on k different latent k-dimensional Gaussian states. """ from hmmlearn.hmm import GaussianHMM from sklearn.cluster import KMeans x, mask = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + positive_eigs, ) NULL = -100 kmeans = KMeans( n_clusters=n_clusters, - init='k-means++', + init="k-means++", n_init=100, max_iter=10000, tol=0.00001, - precompute_distances='auto', + precompute_distances="auto", verbose=0, random_state=42, copy_x=True, n_jobs=32, - algorithm='auto', + algorithm="auto", ) kmeans.fit(x) means_init = kmeans.cluster_centers_.tolist() @@ -369,27 +469,30 @@ def gaussian_hmm(eigvals, eigvecs, n_clusters, n_components=None, weight_by_eigv means_init = np.array(means_init) X = _extract_eigs( - eigvals, eigvecs, n_clusters, n_components, weight_by_eigval, keep_first, filter_nans=False, positive_eigs=positive_eigs + eigvals, + eigvecs, + n_clusters, + n_components, + weight_by_eigval, + keep_first, + filter_nans=False, + positive_eigs=positive_eigs, ) X[np.isnan(X)] = NULL hmm = GaussianHMM( n_components=n_clusters + 1, # add 1 explicit NULL state - covariance_type='full', + covariance_type="full", min_covar=0.0001, n_iter=100, random_state=42, - init_params='stc', + init_params="stc", ) hmm.means_ = means_init hmm.fit(X) Y = hmm.predict(X) key = -hmm.means_.min(axis=1) - relabel = dict(zip( - range(n_clusters + 1), - np.argsort(key) - )) + relabel = dict(zip(range(n_clusters + 1), np.argsort(key))) labels = np.array([relabel[y] for y in Y]) return labels - diff --git a/bin/utils/common.py b/bin/utils/common.py index 0f9e093..83dff9a 100755 --- a/bin/utils/common.py +++ b/bin/utils/common.py @@ -1,14 +1,15 @@ #!/usr/bin/env python3 +import os from functools import partial + +import bbi import bioframe +import boto3 import numpy as np import pandas as pd -import bbi -import boto3 from botocore import UNSIGNED from botocore.config import Config -import os def s3_to_local(s3_path, local_path): @@ -16,16 +17,16 @@ def s3_to_local(s3_path, local_path): Convert s3 paths to local file paths for pybbi to process. """ # s3 = boto3.resource('s3') - s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) + s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) - path_parts = s3_path.replace("s3://","").split("/") - bucket=path_parts.pop(0) - key="/".join(path_parts) + path_parts = s3_path.replace("s3://", "").split("/") + bucket = path_parts.pop(0) + key = "/".join(path_parts) if not os.path.exists(local_path): os.makedirs(local_path) - filename = key.rsplit('/', 1)[-1] + filename = key.rsplit("/", 1)[-1] # s3.Object(bucket, key).download_file(local_path + filename) s3.download_file(bucket, key, local_path + filename) local_file = f"{local_path}{filename}" @@ -33,10 +34,10 @@ def s3_to_local(s3_path, local_path): return local_file -def make_chromarms(chromsizes, mids, binsize=None, suffixes=('p', 'q')): +def make_chromarms(chromsizes, mids, binsize=None, suffixes=("p", "q")): """ Split chromosomes into chromosome arms - + Parameters ---------- chromsizes : pandas.Series @@ -48,7 +49,7 @@ def make_chromarms(chromsizes, mids, binsize=None, suffixes=('p', 'q')): bin grid. suffixes : tuple, optional Suffixes to name chromosome arms. Defaults to p and q. - + Returns ------- 4-column BED-like DataFrame (chrom, start, end, name). @@ -57,10 +58,7 @@ def make_chromarms(chromsizes, mids, binsize=None, suffixes=('p', 'q')): """ chromosomes = [chrom for chrom in chromsizes.index if chrom in mids] - p_arms = [ - [chrom, 0, mids[chrom], chrom + suffixes[0]] - for chrom in chromosomes - ] + p_arms = [[chrom, 0, mids[chrom], chrom + suffixes[0]] for chrom in chromosomes] if binsize is not None: for x in p_arms: x[2] = int(round(x[2] / binsize)) * binsize @@ -75,30 +73,28 @@ def make_chromarms(chromsizes, mids, binsize=None, suffixes=('p', 'q')): interleaved = [*sum(zip(p_arms, q_arms), ())] - return pd.DataFrame( - interleaved, - columns=['chrom', 'start', 'end', 'name'] - ) + return pd.DataFrame(interleaved, columns=["chrom", "start", "end", "name"]) def assign_arms(df, arms): g = { - arm['name']: bioframe.select( - df, (arm.chrom, arm.start, arm.end) - ).assign(arm=arm['name']) for _, arm in arms.iterrows() + arm["name"]: bioframe.select(df, (arm.chrom, arm.start, arm.end)).assign( + arm=arm["name"] + ) + for _, arm in arms.iterrows() } return pd.concat(g.values(), ignore_index=True) def assign_centel(group, arms): this_arm = group.name - if group.name.endswith('p'): - arm_len = arms.loc[this_arm, 'end'] - return 1 - (group['end'] / arm_len) + if group.name.endswith("p"): + arm_len = arms.loc[this_arm, "end"] + return 1 - (group["end"] / arm_len) else: # q-arm or acrocentric - arm_start = arms.loc[this_arm, 'start'] - arm_len = arms.loc[this_arm, 'end'] - arm_start - return (group['end'] - arm_start) / arm_len + arm_start = arms.loc[this_arm, "start"] + arm_len = arms.loc[this_arm, "end"] - arm_start + return (group["end"] - arm_start) / arm_len def _fetch_binned_chrom(path, chromsizes, local_path, binsize, chrom): @@ -116,6 +112,7 @@ def _fetch_binned_chrom(path, chromsizes, local_path, binsize, chrom): def fetch_binned(path, chromsizes, local_path, chromosomes, binsize, map): - out = map(partial(_fetch_binned_chrom, path, chromsizes, local_path, binsize), chromosomes) + out = map( + partial(_fetch_binned_chrom, path, chromsizes, local_path, binsize), chromosomes + ) return np.concatenate(list(out)) - diff --git a/bin/utils/df2multivec.py b/bin/utils/df2multivec.py index 646366f..48c3f7f 100644 --- a/bin/utils/df2multivec.py +++ b/bin/utils/df2multivec.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -from math import ceil -import os.path as op import math +from math import ceil -from scipy.special import logsumexp -import numpy as np -import pandas as pd import h5py +import numpy as np +from scipy.special import logsumexp def nansum_agg(x): @@ -35,16 +33,13 @@ def to_multivec( h5opts=None, ): if h5opts is None: - h5opts = { - 'compression': 'gzip', - 'compression_opts': 6, - 'shuffle': True - } + h5opts = {"compression": "gzip", "compression_opts": 6, "shuffle": True} chromosomes = list(chromsizes.keys()) - grouped = df.groupby('chrom') + grouped = df.groupby("chrom") array_dict = { - chrom: grouped.get_group(chrom).loc[:, feature_names].values for chrom in chromosomes + chrom: grouped.get_group(chrom).loc[:, feature_names].values + for chrom in chromosomes } chroms, lengths = zip(*chromsizes.items()) chrom_array = np.array(chroms, dtype="S") @@ -64,10 +59,7 @@ def to_multivec( **h5opts ) f["chroms"].create_dataset( - "length", - shape=(len(chroms),), - data=lengths, - **h5opts + "length", shape=(len(chroms),), data=lengths, **h5opts ) # the data goes here @@ -98,19 +90,15 @@ def to_multivec( continue dset = grp["values"].create_dataset( - str(chrom), - array_dict[chrom].shape, - **h5opts + str(chrom), array_dict[chrom].shape, **h5opts ) start = 0 step = int(min(chunksize, len(dset))) while start < len(dset): # see above section - dset[start : start + step] = \ - array_dict[chrom][start : start + step] + dset[start : start + step] = array_dict[chrom][start : start + step] start += int(min(chunksize, len(array_dict[chrom]) - start)) - # we're going to go through and create the data for the different # zoom levels by summing adjacent data points prev_res = res @@ -136,12 +124,8 @@ def to_multivec( start = 0 step = int(min(chunksize, len(prev_dset))) - shape = (int(ceil(prev_dset.shape[0]/2)), prev_dset.shape[1]) - dset = grp["values"].create_dataset( - chrom, - shape, - **h5opts - ) + shape = (int(ceil(prev_dset.shape[0] / 2)), prev_dset.shape[1]) + dset = grp["values"].create_dataset(chrom, shape, **h5opts) while start < len(prev_dset): prev_data = prev_dset[start : start + step] @@ -160,4 +144,3 @@ def to_multivec( start += int(min(chunksize, len(prev_dset) - start)) prev_res = res - \ No newline at end of file diff --git a/bin/utils/eigdecomp.py b/bin/utils/eigdecomp.py index dfc2fc0..6a6cf5a 100644 --- a/bin/utils/eigdecomp.py +++ b/bin/utils/eigdecomp.py @@ -1,15 +1,12 @@ #!/usr/bin/env python3 +import bioframe import numpy as np import pandas as pd import scipy.sparse import scipy.stats - +from cooltools.api.eigdecomp import _fake_cis, _filter_heatmap from cooltools.lib import numutils -from cooltools.api.eigdecomp import _filter_heatmap, _fake_cis - -import bioframe -import cooler def _orient_eigs(eigvecs, phasing_track, corr_metric=None): @@ -33,7 +30,6 @@ def _orient_eigs(eigvecs, phasing_track, corr_metric=None): This function does NOT change the order of the eigenvectors. """ for i in range(eigvecs.shape[1]): - mask = np.isfinite(eigvecs[:, i]) & np.isfinite(phasing_track) if corr_metric is None or corr_metric == "spearmanr": @@ -45,9 +41,8 @@ def _orient_eigs(eigvecs, phasing_track, corr_metric=None): # multiply by the sign to keep the phasing information corr = np.sign(corr) * corr * corr * np.var(eigvecs[mask, i]) elif corr_metric == "MAD_explained": - corr = ( - numutils.COMED(phasing_track[mask], eigvecs[mask, i]) * - numutils.MAD(eigvecs[mask, i]) + corr = numutils.COMED(phasing_track[mask], eigvecs[mask, i]) * numutils.MAD( + eigvecs[mask, i] ) else: raise ValueError("Unknown correlation metric: {}".format(corr_metric)) @@ -72,9 +67,9 @@ def _normalized_affinity_matrix_from_trans(A, partition, perc_top, perc_bottom): whose last element is the last bin of the last chromosome. perc_top : float Clip trans blowout pixels above this cutoff. - perc_bottom : + perc_bottom : Mask bins with trans coverage below this cutoff. - + Returns ------- 2D array (n, n) @@ -108,7 +103,9 @@ def _normalized_affinity_matrix_from_trans(A, partition, perc_top, perc_bottom): A[:, is_bad_bin] = 0 if np.any(~np.isfinite(A)): - raise ValueError("Matrix A contains point-wise NaNs or infinite values, not expected for cooler-loaded maps") + raise ValueError( + "Matrix A contains point-wise NaNs or infinite values, not expected for cooler-loaded maps" + ) # Filter the heatmap is_good_bin = ~is_bad_bin @@ -149,12 +146,12 @@ def eig_trans( perc_top=99.95, phasing_track_col="GC", corr_metric=None, - which='LM', + which="LM", ): """ - Spectral decomposition of trans Hi-C data derived from a normalized + Spectral decomposition of trans Hi-C data derived from a normalized affinity representation. - Each eigenvector is deterministically oriented with respect to a provided + Each eigenvector is deterministically oriented with respect to a provided "phasing track" (e.g. GC content). Parameters ---------- @@ -176,14 +173,14 @@ def eig_trans( Clip trans blowout pixels above this cutoff. perc_bottom : float Mask bins with trans coverage below this cutoff. - phasing_track_col : - Column of bin table to use for deterministically orienting the + phasing_track_col : + Column of bin table to use for deterministically orienting the eigenvectors. corr_metric : str Correlation metric to use for selecting orientations. which : str Code for the eigenvalue order in which components are calculated. - (LM = largest/descending magnitude/modulus; LA = largest/descending + (LM = largest/descending magnitude/modulus; LA = largest/descending algebraic value). Returns ------- @@ -193,12 +190,12 @@ def eig_trans( Table of eigenvectors (as columns). Notes ----- - This is very similar to the trans eigendecomposition method from - Imakaev et al. 2012 and the implementation in cooltools but differs in - how the matrix is normalized before being decomposed. The main impact is - that the eigen*value* spectra end up being standardized and thus easier to + This is very similar to the trans eigendecomposition method from + Imakaev et al. 2012 and the implementation in cooltools but differs in + how the matrix is normalized before being decomposed. The main impact is + that the eigen*value* spectra end up being standardized and thus easier to assess and compare between datasets. Moreover, because the matrix is not - mean-centered before decomposition, an additional trivial eigenvector will + mean-centered before decomposition, an additional trivial eigenvector will be produced having eigenvalue 1. Hence, we return n_eigs + 1 vectors. For more details, see the Supplemental Note of Spracklin, Abdennur et al., 2021: https://www.biorxiv.org/content/10.1101/2021.08.05.455340v1.supplementary-material @@ -214,8 +211,8 @@ def eig_trans( bins = bins[lo:hi] # Apply blacklist if available. - if 'is_bad' in bins.columns: - mask = bins['is_bad'].values.astype(bool) + if "is_bad" in bins.columns: + mask = bins["is_bad"].values.astype(bool) A[mask, :] = np.nan A[:, mask] = np.nan @@ -229,9 +226,7 @@ def eig_trans( phasing_track = bins[phasing_track_col].values[lo:hi] # Compute the affinity matrix. - A = _normalized_affinity_matrix_from_trans( - A, partition, perc_top, perc_bottom - ) + A = _normalized_affinity_matrix_from_trans(A, partition, perc_top, perc_bottom) # Compute eigs on the doubly stochastic affinity matrix # We actually extract n + 1 eigenvectors. @@ -239,9 +234,7 @@ def eig_trans( mask = np.sum(np.abs(A), axis=0) != 0 A_collapsed = A[mask, :][:, mask].astype(np.float64, copy=True) eigvals, eigvecs_collapsed = scipy.sparse.linalg.eigsh( - A_collapsed, - n_eigs + 1, - which=which + A_collapsed, n_eigs + 1, which=which ) eigvecs = np.full((len(mask), n_eigs + 1), np.nan) eigvecs[mask, :] = eigvecs_collapsed @@ -256,10 +249,12 @@ def eig_trans( eigvecs = _orient_eigs(eigvecs, phasing_track, corr_metric) # Prepare outputs. - eigval_table = pd.DataFrame({ - 'eig': ["E{}".format(i) for i in range(n_eigs + 1)], - 'val': eigvals, - }) + eigval_table = pd.DataFrame( + { + "eig": ["E{}".format(i) for i in range(n_eigs + 1)], + "val": eigvals, + } + ) eigvec_table = bins.copy() for i in range(n_eigs + 1): eigvec_table["E{}".format(i)] = eigvecs[:, i].copy() @@ -295,7 +290,9 @@ def _normalized_affinity_matrix_from_cis(A, perc_top, perc_bottom, ignore_diags= n_bins = A.shape[0] if ignore_diags >= n_bins: - raise ValueError("Number of ignored diagonals should be less than the number of bins") + raise ValueError( + "Number of ignored diagonals should be less than the number of bins" + ) # Zero out bins nulled out using NaNs is_bad_bin = np.nansum(A, axis=0) == 0 @@ -303,7 +300,9 @@ def _normalized_affinity_matrix_from_cis(A, perc_top, perc_bottom, ignore_diags= A[:, is_bad_bin] = 0 if np.any(~np.isfinite(A)): - raise ValueError("Matrix A contains point-wise NaNs or infinite values, not expected for cooler-loaded maps") + raise ValueError( + "Matrix A contains point-wise NaNs or infinite values, not expected for cooler-loaded maps" + ) # Additional checks for ignore_diags after is_good_bin = ~is_bad_bin @@ -342,6 +341,7 @@ def _normalized_affinity_matrix_from_cis(A, perc_top, perc_bottom, ignore_diags= return OE + def eig_cis( clr, bins, @@ -353,7 +353,7 @@ def eig_cis( perc_top=99.95, phasing_track_col="GC", corr_metric=None, - which='LM', + which="LM", ): """ Spectral decomposition of cis Hi-C data derived from a normalized @@ -407,7 +407,10 @@ def eig_cis( # get chromosomes from cooler, if view_df not specified: if view_df is None: view_df = bioframe.make_viewframe( - [(chrom, 0, clr.chromsizes[chrom]) for chrom in clr.chromnames and chrom in chomosomes_bins] + [ + (chrom, 0, clr.chromsizes[chrom]) + for chrom in clr.chromnames and chrom in chomosomes_bins + ] ) else: # appropriate viewframe checks: @@ -429,14 +432,12 @@ def eig_cis( # Prepare output table with eigenvectors eigvec_table = bins.copy() - eigvec_columns = [f"E{i}" for i in range(n_eigs+1)] + eigvec_columns = [f"E{i}" for i in range(n_eigs + 1)] for ev_col in eigvec_columns: eigvec_table[ev_col] = np.nan # Prepare output table with eigenvalues - eigval_table = pd.DataFrame({ - 'eig': eigvec_columns - }) + eigval_table = pd.DataFrame({"eig": eigvec_columns}) def _each(region): """ @@ -457,8 +458,8 @@ def _each(region): A = clr.matrix(balance=balance).fetch(_region) # Apply blacklist if available. - if 'is_bad' in bins.columns: - mask = bioframe.select(bins, _region)['is_bad'].values.astype(bool) + if "is_bad" in bins.columns: + mask = bioframe.select(bins, _region)["is_bad"].values.astype(bool) A[mask, :] = np.nan A[:, mask] = np.nan @@ -476,14 +477,18 @@ def _each(region): # The first eigenvector, E0, will be uniform with eigenvalue 1. mask = np.sum(np.abs(A), axis=0) != 0 - if (A.shape[0] <= ignore_diags + n_eigs + 1) or (mask.sum() <= ignore_diags + n_eigs + 1): - return _region, np.nan * np.ones(n_eigs+1), np.nan * np.ones((len(A), n_eigs+1)) + if (A.shape[0] <= ignore_diags + n_eigs + 1) or ( + mask.sum() <= ignore_diags + n_eigs + 1 + ): + return ( + _region, + np.nan * np.ones(n_eigs + 1), + np.nan * np.ones((len(A), n_eigs + 1)), + ) A_collapsed = A[mask, :][:, mask].astype(np.float64, copy=True) eigvals, eigvecs_collapsed = scipy.sparse.linalg.eigsh( - A_collapsed, - n_eigs + 1, - which=which + A_collapsed, n_eigs + 1, which=which ) eigvecs = np.full((len(mask), n_eigs + 1), np.nan) eigvecs[mask, :] = eigvecs_collapsed @@ -505,14 +510,13 @@ def _each(region): # Go through eigendecomposition results and fill in # Output table eigvec_table and eigvals_table for _region, _eigvals, _eigvecs in results: - - if len(_region)==3: - region_name = f'{_region[0]}:{_region[1]}-{_region[2]}' + if len(_region) == 3: + region_name = f"{_region[0]}:{_region[1]}-{_region[2]}" else: region_name = _region[3] idx = bioframe.select(eigvec_table, _region).index.values - eigvec_table.loc[idx, 'name'] = region_name + eigvec_table.loc[idx, "name"] = region_name eigvec_table.loc[idx, eigvec_columns] = _eigvecs eigval_table.loc[:, region_name] = _eigvals @@ -520,7 +524,6 @@ def _each(region): return eigval_table, eigvec_table - def projection(A, eigs, n_eigs=None): """ Project matrix A to the eigenvectors eigs. @@ -539,11 +542,8 @@ def projection(A, eigs, n_eigs=None): raise ValueError(f"Matrix and eigs shape mismatch: {n_bins} {len(eigs)}") # Filter out missing data: - E = eigs.loc[:, 'E0':f'E{n_eigs}'].to_numpy().astype(np.float64) - mask = ( - (np.sum(np.abs(A), axis=0) != 0) & - (np.sum(np.isnan(E), axis=1) == 0) - ) + E = eigs.loc[:, "E0":f"E{n_eigs}"].to_numpy().astype(np.float64) + mask = (np.sum(np.abs(A), axis=0) != 0) & (np.sum(np.isnan(E), axis=1) == 0) A_collapsed = A[mask, :][:, mask].astype(float, copy=True) E_collapsed = E[mask, :] @@ -555,10 +555,11 @@ def projection(A, eigs, n_eigs=None): proj[mask, :] = np.hstack(result) # Create projection table: - proj_table = eigs.loc[:, ['chrom', 'start', 'end']].copy() + proj_table = eigs.loc[:, ["chrom", "start", "end"]].copy() for i in range(n_eigs + 1): proj_table["E{}".format(i)] = proj[:, i].copy() - proj_table["E{}".format(i)] /= np.linalg.norm(proj_table["E{}".format(i)].dropna()) + proj_table["E{}".format(i)] /= np.linalg.norm( + proj_table["E{}".format(i)].dropna() + ) return proj_table - \ No newline at end of file diff --git a/bin/utils/plotting.py b/bin/utils/plotting.py index cfc1a82..84d417c 100644 --- a/bin/utils/plotting.py +++ b/bin/utils/plotting.py @@ -1,16 +1,15 @@ #!/usr/bin/env python3 -import numpy as np -import pandas as pd -from cooltools.lib import numutils, runlength +import datashader as ds import matplotlib.pyplot as plt +import numpy as np import seaborn as sns -from mpl_toolkits.axes_grid1 import ImageGrid +from cooltools.lib import numutils, runlength from datashader.mpl_ext import dsshow -import datashader as ds -import datashader.transfer_functions as tf -plt.rcParams['pdf.fonttype'] = 42 -plt.rcParams['svg.fonttype'] = 'none' +from mpl_toolkits.axes_grid1 import ImageGrid + +plt.rcParams["pdf.fonttype"] = 42 +plt.rcParams["svg.fonttype"] = "none" def plot_spectrum(eigval_df, n_eigs_display, title, outpath): @@ -19,29 +18,22 @@ def plot_spectrum(eigval_df, n_eigs_display, title, outpath): plt.suptitle(title) plt.subplot(gs[0]) plt.stem( - eigval_df['eig'][:n_eigs_display + 1], - eigval_df['val'][:n_eigs_display + 1] + eigval_df["eig"][: n_eigs_display + 1], eigval_df["val"][: n_eigs_display + 1] ) plt.subplot(gs[1]) - sns.rugplot(eigval_df['val']) - sns.kdeplot(eigval_df['val'], bw_adjust=0.5) + sns.rugplot(eigval_df["val"]) + sns.kdeplot(eigval_df["val"], bw_adjust=0.5) plt.xlim(-1, 1) return fig def plot_heatmap( - idx, - eigs, - bins, - trackconfs, - blocks, - coarse_factor=32, - options_default=None + idx, eigs, bins, trackconfs, blocks, coarse_factor=32, options_default=None ): if options_default is None: options_default = { - 'cmap': 'Reds', - 'vmin': 0, + "cmap": "Reds", + "vmin": 0, } E = eigs.values[idx, :].T labels = bins["cluster"].values @@ -60,27 +52,22 @@ def plot_heatmap( # Eig block ax = ax1 = plt.subplot(gs[0]) - X = numutils.coarsen( - np.nanmean, - E, - {1: coarse_factor}, - trim_excess=True - ) + X = numutils.coarsen(np.nanmean, E, {1: coarse_factor}, trim_excess=True) ax.matshow( X, rasterized=True, extent=extent, - cmap='RdBu_r', + cmap="RdBu_r", vmin=-0.005, vmax=0.005, ) - ax.set_aspect('auto') + ax.set_aspect("auto") ax.xaxis.set_visible(False) ax.set_yticks(np.arange(E.shape[0])) ax.set_xlim(-0.5, E.shape[1] - 0.5) ax.set_ylim(E.shape[0] - 0.5, -0.5) - ax.set_yticklabels([f'E{i}' for i in range(1, E.shape[0] + 1)]) - plt.vlines(lines, -0.5, E.shape[1]-0.5, lw=1, color='k') + ax.set_yticklabels([f"E{i}" for i in range(1, E.shape[0] + 1)]) + plt.vlines(lines, -0.5, E.shape[1] - 0.5, lw=1, color="k") # plt.colorbar(im) level = 1 @@ -92,41 +79,30 @@ def plot_heatmap( x = bins[track_name].values[idx] track_conf = trackconfs[track_name] - track_type = track_conf.get('type', 'scalar') - kwargs = track_conf.get('options', {}) + track_type = track_conf.get("type", "scalar") + kwargs = track_conf.get("options", {}) - kwargs.setdefault('cmap', - 'RdBu_r' if track_type == 'divergent' else 'Reds' - ) - kwargs.setdefault('vmin', 0) - if 'vmax' not in kwargs: + kwargs.setdefault("cmap", "RdBu_r" if track_type == "divergent" else "Reds") + kwargs.setdefault("vmin", 0) + if "vmax" not in kwargs: vopt = np.percentile(np.max(np.abs(x)), 90) - if track_type == 'divergent': - kwargs['vmin'] = -vopt + if track_type == "divergent": + kwargs["vmin"] = -vopt else: - kwargs['vmin'] = 0 - kwargs['vmax'] = vopt + kwargs["vmin"] = 0 + kwargs["vmax"] = vopt X = numutils.coarsen( - np.nanmean, - np.array([x]), - {1: coarse_factor}, - trim_excess=True - ) - im = ax.matshow( - X, - rasterized=True, - extent=extent, - origin='lower', - **kwargs + np.nanmean, np.array([x]), {1: coarse_factor}, trim_excess=True ) - ax.set_aspect('auto') + im = ax.matshow(X, rasterized=True, extent=extent, origin="lower", **kwargs) + ax.set_aspect("auto") ax.xaxis.set_visible(False) ax.set_xlim(lo - 0.5, hi - 0.5) ax.set_ylim(-0.5, 0.5) ax.set_yticks([0]) ax.set_yticklabels([track_name]) - plt.vlines(lines, -0.5, 0.5, lw=1, color='k') + plt.vlines(lines, -0.5, 0.5, lw=1, color="k") # plt.colorbar(im) level += 1 return fig @@ -134,52 +110,47 @@ def plot_heatmap( def plot_scatters(eigs, bins, trackconfs, panels): ncols = 4 - xvar = 'E1' - yvar = 'E2' + xvar = "E1" + yvar = "E2" lo, hi = -0.015, 0.015 tick_lo, tick_hi = -0.01, 0.01 ds_options = { - 'glyph': ds.Point(xvar, yvar), - 'x_range': [lo, hi], - 'y_range': [lo, hi], + "glyph": ds.Point(xvar, yvar), + "x_range": [lo, hi], + "y_range": [lo, hi], # 'shade_hook': tf.dynspread, #partial(tf.dynspread, threshold=0.75, max_px=4), - 'aspect': 'equal', - 'rasterized': True, - 'interpolation': 'none' + "aspect": "equal", + "rasterized": True, + "interpolation": "none", } for panel_name, track_list in panels.items(): - - nrows = int(np.ceil((len(track_list) + 1)/ncols)) + nrows = int(np.ceil((len(track_list) + 1) / ncols)) gridshape = (nrows, ncols) fig = plt.figure(figsize=(3 * ncols, 3 * nrows)) grid = ImageGrid( - fig, 111, gridshape, + fig, + 111, + gridshape, share_all=True, cbar_mode="each", cbar_pad=0.05, - axes_pad=0.5 + axes_pad=0.5, ) grid[0].set_xticks([tick_lo, 0, tick_hi]) grid[0].set_yticks([tick_lo, 0, tick_hi]) for i in range(gridshape[0]): - grid[i*gridshape[1]].set_ylabel(yvar) + grid[i * gridshape[1]].set_ylabel(yvar) for j in range(gridshape[1]): - grid[gridshape[1]*(gridshape[0] - 1) + j].set_xlabel(xvar) + grid[gridshape[1] * (gridshape[0] - 1) + j].set_xlabel(xvar) # Density ax = grid[0] cax = grid.cbar_axes[0] - da = dsshow( - eigs, - norm='linear', - cmap='viridis', - ax=ax, - **ds_options - ) - ax.set_title('density') - ax.axvline(0, c='k', lw=0.5, ls='--', alpha=0.5) - ax.axhline(0, c='k', lw=0.5, ls='--', alpha=0.5) + da = dsshow(eigs, norm="linear", cmap="viridis", ax=ax, **ds_options) + ax.set_title("density") + ax.axvline(0, c="k", lw=0.5, ls="--", alpha=0.5) + ax.axhline(0, c="k", lw=0.5, ls="--", alpha=0.5) plt.colorbar(da, cax=cax) # Other signals @@ -188,58 +159,44 @@ def plot_scatters(eigs, bins, trackconfs, panels): cax = grid.cbar_axes[i] track_conf = trackconfs[track_name] - track_type = track_conf.get('type', 'scalar') - kwargs = track_conf.get('options', {}) - - if track_type == 'category': - df = eigs.assign( - z=bins[track_name].astype('category') - ) - kwargs['ax'] = ax - if 'facecolor' in kwargs: - ax.set_facecolor(kwargs.pop('facecolor')) - da = dsshow( - df, - aggregator=ds.count_cat('z'), - **kwargs, - **ds_options - ) - ax.set_title(track_name); - ax.axvline(0, c='k', lw=0.5, ls='--', alpha=0.5); - ax.axhline(0, c='k', lw=0.5, ls='--', alpha=0.5); + track_type = track_conf.get("type", "scalar") + kwargs = track_conf.get("options", {}) + + if track_type == "category": + df = eigs.assign(z=bins[track_name].astype("category")) + kwargs["ax"] = ax + if "facecolor" in kwargs: + ax.set_facecolor(kwargs.pop("facecolor")) + da = dsshow(df, aggregator=ds.count_cat("z"), **kwargs, **ds_options) + ax.set_title(track_name) + ax.axvline(0, c="k", lw=0.5, ls="--", alpha=0.5) + ax.axhline(0, c="k", lw=0.5, ls="--", alpha=0.5) cax.legend( handles=da.get_legend_elements(), fontsize=6, borderaxespad=0, - loc='upper left' - ); - cax.axis("off"); + loc="upper left", + ) + cax.axis("off") else: df = eigs.assign(z=bins[track_name]) - kwargs['ax'] = ax - kwargs.setdefault('norm', 'linear') - kwargs.setdefault('cmap', - 'RdBu_r' if track_type == 'divergent' else 'Oranges' + kwargs["ax"] = ax + kwargs.setdefault("norm", "linear") + kwargs.setdefault( + "cmap", "RdBu_r" if track_type == "divergent" else "Oranges" ) - kwargs.setdefault('vmin', 0) - if 'vmax' not in kwargs: - vopt = np.percentile(np.max(np.abs(df['z'])), 90) - if track_type == 'divergent': - kwargs['vmin'] = -vopt + kwargs.setdefault("vmin", 0) + if "vmax" not in kwargs: + vopt = np.percentile(np.max(np.abs(df["z"])), 90) + if track_type == "divergent": + kwargs["vmin"] = -vopt else: - kwargs['vmin'] = 0 - kwargs['vmax'] = vopt - if 'facecolor' in kwargs: - ax.set_facecolor(kwargs.pop('facecolor')) - da = dsshow( - df, - aggregator=ds.mean('z'), - **kwargs, - **ds_options - ) - ax.set_title(track_name); - ax.axvline(0, c='k', lw=0.5, ls='--', alpha=0.5); - ax.axhline(0, c='k', lw=0.5, ls='--', alpha=0.5); - plt.colorbar(da, cax=cax); - - \ No newline at end of file + kwargs["vmin"] = 0 + kwargs["vmax"] = vopt + if "facecolor" in kwargs: + ax.set_facecolor(kwargs.pop("facecolor")) + da = dsshow(df, aggregator=ds.mean("z"), **kwargs, **ds_options) + ax.set_title(track_name) + ax.axvline(0, c="k", lw=0.5, ls="--", alpha=0.5) + ax.axhline(0, c="k", lw=0.5, ls="--", alpha=0.5) + plt.colorbar(da, cax=cax)