Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: install pre-commit; remove unused imports, auto-order imports, etc. #12

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: debug-statements
- id: name-tests-test
- id: requirements-txt-fixer
- repo: https://github.com/asottile/setup-cfg-fmt
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can drop this hook (setup-cfg-fmt), since there's no setup.cfg in this repository.

rev: v2.4.0
hooks:
- id: setup-cfg-fmt
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
hooks:
- id: flake8
- repo: https://github.com/PyCQA/isort
Comment on lines +15 to +19
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might want to consider replacing these hooks (flake8 and isort) with ruff.

rev: 5.12.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
hooks:
- id: mypy
additional_dependencies: [types-all]
exclude: ^testing/resources/
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:
- id: black
46 changes: 18 additions & 28 deletions bin/clustering_main.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,37 @@
#!/usr/bin/env python3

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt
import matplotlib as mpl

from functools import partial
import os.path as op
import pathlib
mpl.use("Agg")
import argparse

from tqdm import tqdm
import bioframe
import cooler
import h5py
import numpy as np
import pandas as pd
import argparse
import yaml

import utils.clustering as cluster
import yaml

parser = argparse.ArgumentParser()

parser.add_argument('--config')
parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True)
parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True)
parser.add_argument('--bins', help='parquet bins file', required=True)
parser.add_argument("--config")
parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True)
parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True)
parser.add_argument("--bins", help="parquet bins file", required=True)

args = parser.parse_args()

with open(args.config, "r") as infile:
config = yaml.full_load(infile)

assembly = config["assembly"]
n_clusters = config["n_clusters"]
binsize = config["binsize"]
sample = config["sample"]
n_eigs = config["n_eigs"]

CHROMSIZES = bioframe.fetch_chromsizes(assembly)
CHROMOSOMES = list(CHROMSIZES[:'chrY'].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index)
CHROMOSOMES = list(CHROMSIZES[:"chrY"].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index)

try:
CENTROMERES = bioframe.fetch_centromeres(assembly)
Expand All @@ -54,21 +46,20 @@
cluster_sort_key = "GC"

eigvecs = pd.read_parquet(args.eigvecs)
eigvals = pd.read_parquet(args.eigvals).set_index('eig')
eigvecs = eigvecs[eigvecs['chrom'].isin(chromosomes)]
eigvals = pd.read_parquet(args.eigvals).set_index("eig")
eigvecs = eigvecs[eigvecs["chrom"].isin(chromosomes)]

# Use as many eigenvectors as initial positive eigenvalues
n_components = np.where(eigvals < 0)[0][0] - 1
print(f"Using {n_components} components for clustering...")

sorting_tracks = pd.read_parquet(args.bins)
sorting_tracks = sorting_tracks[sorting_tracks['chrom'].isin(chromosomes)]
sorting_tracks = sorting_tracks[sorting_tracks["chrom"].isin(chromosomes)]

out = eigvecs[['chrom', 'start', 'end']].copy()
out = eigvecs[["chrom", "start", "end"]].copy()

for n_cluster in n_clusters:

colname = f'kmeans_sm{n_cluster}'
colname = f"kmeans_sm{n_cluster}"

labels = cluster.kmeans_sm(
eigvals,
Expand All @@ -85,7 +76,6 @@
)

out[colname] = new_labels
out[colname + '_order'] = bin_ranks

out.to_csv(f"{sample}.{binsize}.E1-E{n_eigs}.kmeans_sm.tsv", sep='\t', index=False)
out[colname + "_order"] = bin_ranks

out.to_csv(f"{sample}.{binsize}.E1-E{n_eigs}.kmeans_sm.tsv", sep="\t", index=False)
59 changes: 22 additions & 37 deletions bin/eigdecomp_main.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,37 @@
#!/usr/bin/env python3

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt

from functools import partial
import os.path as op
import pathlib
import matplotlib as mpl

mpl.use("Agg")
import argparse

from tqdm import tqdm
import bioframe
import cooler
import h5py
import numpy as np
import pandas as pd
import argparse
import yaml

import utils.common as common
import utils.eigdecomp as eig
import yaml

parser = argparse.ArgumentParser()
parser.add_argument('--config')
parser.add_argument('--bins', help='parquet bins file', required=True)
parser.add_argument('--blacklist', help='blacklist file path')
parser.add_argument('--cooler', help='cooler file path', required=True)
parser.add_argument("--config")
parser.add_argument("--bins", help="parquet bins file", required=True)
parser.add_argument("--blacklist", help="blacklist file path")
parser.add_argument("--cooler", help="cooler file path", required=True)

args = parser.parse_args()

with open(args.config, "r") as infile:
config = yaml.full_load(infile)

assembly = config["assembly"]
binsize = config["binsize"]
sample = config["sample"]
n_eigs = config["n_eigs"]
decomp_mode = config["decomp_mode"]

CHROMSIZES = bioframe.fetch_chromsizes(assembly)
CHROMOSOMES = list(CHROMSIZES[:'chrY'].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index)
CHROMOSOMES = list(CHROMSIZES[:"chrY"].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index)

try:
CENTROMERES = bioframe.fetch_centromeres(assembly)
Expand All @@ -50,29 +42,23 @@

# has a header (chrom, start, end, GC)
ref_track = pd.read_parquet(args.bins)
ref_track = ref_track[ref_track['chrom'].isin(chromosomes)]
ref_track = ref_track[ref_track["chrom"].isin(chromosomes)]

# include blacklist
if args.blacklist is not None:
# no header
blacklist = pd.read_csv(
args.blacklist,
sep='\t',
names=['chrom', 'start', 'end']
blacklist = pd.read_csv(args.blacklist, sep="\t", names=["chrom", "start", "end"])
ref_track = bioframe.count_overlaps(ref_track, blacklist).rename(
columns={"count": "is_bad"}
)
ref_track = (
bioframe.count_overlaps(ref_track, blacklist)
.rename(columns={'count': 'is_bad'})
)
ref_track = ref_track[ref_track['chrom'].isin(chromosomes)]
ref_track = ref_track[ref_track["chrom"].isin(chromosomes)]

path = args.cooler
clr = cooler.Cooler(f"{path}::resolutions/{binsize}")

if decomp_mode=="trans":
if decomp_mode == "trans":
partition = np.r_[
[clr.offset(chrom) for chrom in chromosomes],
clr.extent(chromosomes[-1])[1]
[clr.offset(chrom) for chrom in chromosomes], clr.extent(chromosomes[-1])[1]
]

eigval_df, eigvec_df = eig.eig_trans(
Expand All @@ -84,7 +70,7 @@
corr_metric=None,
)

elif decomp_mode=="cis":
elif decomp_mode == "cis":
viewframe_path = (args.assembly).get("viewframe_cis", None)
if viewframe_path is None:
CHROMARMS = bioframe.make_chromarms(CHROMSIZES, CENTROMERES)
Expand All @@ -98,13 +84,12 @@
phasing_track_col="GC",
n_eigs=n_eigs,
corr_metric=None,
ignore_diags=None, # will be inferred from cooler
view_df=viewframe
ignore_diags=None, # will be inferred from cooler
view_df=viewframe,
)
else:
raise ValueError(f"Mode {decomp_mode} is not implemented")

# Output
eigval_df.to_parquet(f"{sample}.{binsize}.E0-E{n_eigs}.{decomp_mode}.eigvals.pq")
eigvec_df.to_parquet(f"{sample}.{binsize}.E0-E{n_eigs}.{decomp_mode}.eigvecs.pq")

70 changes: 33 additions & 37 deletions bin/heatmap.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,27 @@
#!/usr/bin/env python3

import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pyplot as plt

from functools import partial
import matplotlib as mpl

mpl.use("Agg")
import argparse
import os.path as op
import pathlib

from tqdm import tqdm
import bioframe
import cooler
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
import yaml

import utils.plotting as plotting
import yaml

parser = argparse.ArgumentParser()
parser.add_argument('--config')
parser.add_argument('--eigvals', help='parquet file with eigenvalues', required=True)
parser.add_argument('--eigvecs', help='parquet file with eigenvectors', required=True)
parser.add_argument('--bins', help='parquet bins file', required=True)
parser.add_argument('--cluster', help='tsv file with k-means clusters', required=True)
parser.add_argument('--track_db', help='track_db file', required=True)
parser.add_argument('--meta', help='bigwig metadata file', required=True)
parser.add_argument("--config")
parser.add_argument("--eigvals", help="parquet file with eigenvalues", required=True)
parser.add_argument("--eigvecs", help="parquet file with eigenvectors", required=True)
parser.add_argument("--bins", help="parquet bins file", required=True)
parser.add_argument("--cluster", help="tsv file with k-means clusters", required=True)
parser.add_argument("--track_db", help="track_db file", required=True)
parser.add_argument("--meta", help="bigwig metadata file", required=True)

args = parser.parse_args()

Expand All @@ -39,56 +34,57 @@
n_eigs = config["n_eigs"]

CHROMSIZES = bioframe.fetch_chromsizes(assembly)
CHROMOSOMES = list(CHROMSIZES[:'chrY'].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:'chr22'].index)
CHROMOSOMES = list(CHROMSIZES[:"chrY"].index)
CHROMOSOMES_FOR_CLUSTERING = list(CHROMSIZES[:"chr22"].index)

try:
CENTROMERES = bioframe.fetch_centromeres(assembly)
except ValueError:
CENTROMERES = None

chromosomes = CHROMOSOMES_FOR_CLUSTERING
sort_by = 'centel'
norm = 'sqrt'
sort_by = "centel"
norm = "sqrt"
n_eigs_heatmap = 10
n_clusters = config["n_clusters"]

eigvecs = pd.read_parquet(args.eigvecs)
eigvals = pd.read_parquet(args.eigvals).set_index('eig')['val']
sqrt_lam = np.sqrt(np.abs(eigvals.loc['E1':f'E{n_eigs_heatmap}'].to_numpy()))
if norm == 'sqrt':
eigvecs.loc[:, 'E1':f'E{n_eigs_heatmap}'] *= sqrt_lam[np.newaxis, :]
eigvecs = eigvecs[eigvecs['chrom'].isin(chromosomes)].copy()
eigvals = pd.read_parquet(args.eigvals).set_index("eig")["val"]
sqrt_lam = np.sqrt(np.abs(eigvals.loc["E1":f"E{n_eigs_heatmap}"].to_numpy()))
if norm == "sqrt":
eigvecs.loc[:, "E1":f"E{n_eigs_heatmap}"] *= sqrt_lam[np.newaxis, :]
eigvecs = eigvecs[eigvecs["chrom"].isin(chromosomes)].copy()

bins = pd.read_parquet(args.bins)
clusters = pd.read_table(args.cluster)

for clus in n_clusters:
bins["cluster"] = clusters[f'kmeans_sm{clus}']
bins["cluster"] = clusters[f"kmeans_sm{clus}"]
track_db_path = args.track_db
if op.exists(track_db_path):
meta = pd.read_table(args.meta).set_index("Name")
with h5py.File(track_db_path, 'r') as db:
with h5py.File(track_db_path, "r") as db:
for group in config["scatter_groups"].values():
for track_name in group:
if track_name not in bins.columns:
uid = meta["ID"].get(track_name, track_name)
bins[track_name] = db[uid][:]
bins = bins[bins['chrom'].isin(chromosomes)].copy()
bins = bins[bins["chrom"].isin(chromosomes)].copy()

if sort_by == 'centel':
idx = np.lexsort([
bins['centel_abs'].values, bins['cluster'].values
])
if sort_by == "centel":
idx = np.lexsort([bins["centel_abs"].values, bins["cluster"].values])
else:
raise ValueError(sort_by)

plotting.plot_heatmap(
idx,
eigvecs.loc[:, 'E1':f'E{n_eigs_heatmap}'],
eigvecs.loc[:, "E1":f"E{n_eigs_heatmap}"],
bins,
trackconfs= config["tracks"],
trackconfs=config["tracks"],
blocks=config["heatmap_groups"],
coarse_factor=32,
)
plt.savefig(f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.heatmap.pdf", bbox_inches='tight')
plt.savefig(
f"{sample}.{binsize}.E1-E{n_eigs}.kmeansm{clus}.heatmap.pdf",
bbox_inches="tight",
)
Loading