Skip to content

Commit

Permalink
Checks for resolution in Hi-C and missing signal fix in lmi.py
Browse files Browse the repository at this point in the history
- prep.py now checks for chromosome nomenclature for both cooler and
hic format. It now also checks that the given resolution for binnarizing
the signal is in the Hi-C file.
- lmi.py now checks if the signals you are computing are in the signals
previously binnarized with prep.py. If not, warns the user and tells
which signals are missing.
- Added fix to missing data in signal. Until now, it was being converted
to 0 and the missing signals were clustered at the end of the region.
Now, the missing signals are equal to the median of the signal of that
region.
  • Loading branch information
Leo Zuber committed Jul 18, 2023
1 parent 1370b13 commit 13d1783
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 35 deletions.
66 changes: 49 additions & 17 deletions metaloci/misc/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def check_diagonal(diagonal: np.ndarray) -> tuple[int, float, float, list]:
return total, percentage_stretch, percentage_zeroes, zero_loc


def clean_matrix(mlobject: mlo.MetalociObject, bad_regions: pd.DataFrame) -> np.ndarray:
def clean_matrix(mlobject: mlo.MetalociObject) -> np.ndarray:
"""
Clean a given HiC matrix. It checks if the matrix has too many zeroes at
he diagonal, removes values that are zero at the diagonal but are not in
Expand All @@ -94,8 +94,6 @@ def clean_matrix(mlobject: mlo.MetalociObject, bad_regions: pd.DataFrame) -> np.
----------
mlo : np.ndarray
METALoci object with a matrix in it.
bad_regions : dict
Dictionay {"region": [], "reason": []} in which to append bad regions.
Returns
-------
Expand Down Expand Up @@ -169,7 +167,9 @@ def signal_normalization(region_signal: pd.DataFrame, pseudocounts: float = None

if pseudocounts is None:

signal = [0.0 if np.isnan(index) else index for index in region_signal]
# signal = [0.0 if np.isnan(index) else index for index in region_signal]
median_default = np.nanmedian(region_signal) # jfk fix
signal = [median_default if np.isnan(index) else index for index in region_signal] # jfk fix

else:

Expand All @@ -194,7 +194,7 @@ def signal_normalization(region_signal: pd.DataFrame, pseudocounts: float = None
return np.array(signal)


def check_chromosome_names(hic_file: Path, data: Path, coords: bool):
def check_cooler_names(hic_file: Path, data: Path, coords: bool):

with open(data[0], "r") as handler:

Expand All @@ -206,27 +206,60 @@ def check_chromosome_names(hic_file: Path, data: Path, coords: bool):

signal_chr_nom = "N"

try:
hic_file = cooler.Cooler(hic_file)

hic_file = cooler.Cooler(hic_file)
if "chr" in [hic_file.chromnames][0][0]:

if "chr" in [hic_file.chromnames][0][0]:
cooler_chr_nom = "chrN"

cooler_chr_nom = "chrN"
else:

cooler_chr_nom = "N"

del hic_file

with open(coords, "r") as handler:

if [line.strip() for line in handler][1].startswith("chr"):

coords_chr_nom = "chrN"

else:

cooler_chr_nom = "N"
coords_chr_nom = "N"

if not signal_chr_nom == cooler_chr_nom == coords_chr_nom:

exit(
"\nThe signal, cooler and chromosome sizes files do not have the same nomenclature for chromosomes:\n"
f"\n\tSignal chromosomes nomenclature is '{signal_chr_nom}'. "
f"\n\tHi-C chromosomes nomenclature is '{cooler_chr_nom}'. "
f"\n\tChromosome sizes nomenclature is '{coords_chr_nom}'. "
"\n\nPlease, rename the chromosome names. "
"\nYou may want to rename the chromosome names in a cooler file with cooler.rename_chroms() in python. "
"\n\nExiting..."
)


except OSError: #CHECK EXCEPTION
def check_hic_names(hic_file: Path, data: Path, coords: bool):

if "chr" in hicstraw.HiCFile(hic_file).getChromosomes()[1].name:
with open(data[0], "r") as handler:

if [line.strip() for line in handler][1].startswith("chr"):

cooler_chr_nom = "chrN"
signal_chr_nom = "chrN"

else:

cooler_chr_nom = "N"
signal_chr_nom = "N"

if "chr" in hicstraw.HiCFile(hic_file).getChromosomes()[1].name:

hic_chr_nom = "chrN"

else:

hic_chr_nom = "N"

del hic_file

Expand All @@ -240,14 +273,13 @@ def check_chromosome_names(hic_file: Path, data: Path, coords: bool):

coords_chr_nom = "N"

if not signal_chr_nom == cooler_chr_nom == coords_chr_nom:
if not signal_chr_nom == hic_chr_nom == coords_chr_nom:

exit(
"\nThe signal, cooler and chromosome sizes files do not have the same nomenclature for chromosomes:\n"
f"\n\tSignal chromosomes nomenclature is '{signal_chr_nom}'. "
f"\n\tHi-C chromosomes nomenclature is '{cooler_chr_nom}'. "
f"\n\tHi-C chromosomes nomenclature is '{hic_chr_nom}'. "
f"\n\tChromosome sizes nomenclature is '{coords_chr_nom}'. "
"\n\nPlease, rename the chromosome names. "
"\nYou may want to rename the chromosome names in a cooler file with cooler.rename_chroms() in python. "
"\n\nExiting..."
)
23 changes: 18 additions & 5 deletions metaloci/spatial_stats/lmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,24 +201,37 @@ def load_region_signals(mlobject: mlo.MetalociObject, signal_data: dict, signal_

signal_types = [line.rstrip() for line in signals_handler]

# region_signal = signal_data[mlobject.chrom][
# (signal_data[mlobject.chrom]["start"] >= int(mlobject.start / mlobject.resolution) * mlobject.resolution)
# & (signal_data[mlobject.chrom]["end"] <= int(mlobject.end / mlobject.resolution) * mlobject.resolution)
# ]

region_signal = signal_data[mlobject.chrom][
(signal_data[mlobject.chrom]["start"] >= int(mlobject.start / mlobject.resolution) * mlobject.resolution)
& (signal_data[mlobject.chrom]["end"] <= int(mlobject.end / mlobject.resolution) * mlobject.resolution)
]
(signal_data[mlobject.chrom]["start"] >= int(np.floor(mlobject.start / mlobject.resolution)) * mlobject.resolution)
& (signal_data[mlobject.chrom]["end"] <= int(np.ceil(mlobject.end / mlobject.resolution)) * mlobject.resolution)
] # jfm: fix

if len(region_signal) != len(mlobject.kk_coords):

tmp = len(mlobject.kk_coords) - len(region_signal)
tmp = np.empty((tmp, len(region_signal.columns)))
tmp[:] = 0
# tmp[:] = 0
tmp[:] = np.nan # jfm: leave NAs in for now. (in misc.signal_normalization() those NaNS will be substituted for
# the median of the signal of the region)

region_signal = pd.concat([region_signal, pd.DataFrame(tmp, columns=list(region_signal))], ignore_index=True)

signals_dict = defaultdict(list)

for signal_type in signal_types:

signals_dict[signal_type] = misc.signal_normalization(region_signal[signal_type])
try:

signals_dict[signal_type] = misc.signal_normalization(region_signal[signal_type])

except KeyError:

return None, signal_type

return signals_dict, signal_types

Expand Down
39 changes: 28 additions & 11 deletions metaloci/tools/ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,21 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool
if mlobject.kk_nodes is None:

if silent == False:

print("Kamada-Kawai layout has not been calculated for this region. \n\tSkipping to next region...")
print("\tKamada-Kawai layout has not been calculated for this region. \n\tSkipping to next region...")

return

# Load only signal for this specific region.
mlobject.signals_dict, signal_types = lmi.load_region_signals(mlobject, signal_data, signals)

# If the signal is not present in the signals folder (has not been processed with prep) but it is in
# the list of signal to process, raise an exception and print which signals need to be processed.
if mlobject.signals_dict == None and progress is not None:

progress["missing_signal"] = signal_types

raise Exception()

# This checks if every signal you want to process is already computed. If the user works with a few signals
# but decides to add some more later, she can use the same working directory and KK, and the LMI will be
# computed again with the new signals. If everything is already computed, does nothing.
Expand All @@ -173,7 +180,6 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool
if [signal for signal, _ in mlobject.lmi_info.items()] == signal_types:

if silent == False:

print("\tLMI already computed for this region. \n\tSkipping to next region...")

if progress is not None: progress["done"] = True
Expand All @@ -188,7 +194,6 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool
df.to_csv(f"{moran_log_path}/{mlobject.region}_{mlobject.poi}_{signal}.txt", sep="\t", index=False)

if silent == False:

print(f"\tLog saved to: {moran_log_path}/{mlobject.region}_{mlobject.poi}_{signal}.txt")

return
Expand Down Expand Up @@ -288,19 +293,31 @@ def run(opts):
try:

manager = mp.Manager()
progress = manager.dict(value=0, timer = start_timer, done = False)
progress = manager.dict(value=0, timer = start_timer, done = False, missing_signal = None)

with mp.Pool(processes=cores) as pool:

pool.starmap(get_lmi, [(row, opts, signal_data, progress) for _, row in df_regions.iterrows()])
try:

if progress["done"] == True:
pool.starmap(get_lmi, [(row, opts, signal_data, progress) for _, row in df_regions.iterrows()])

print("\tSome regions had already been computed and have been skipped.", end="")

if moran_log:
if progress["done"] == True:

print("\tSome regions had already been computed and have been skipped.", end="")

if moran_log:

print(f"\n\tLog saved to: {work_dir}chrN/moran_log", end="")

print(f"\n\tLog saved to: {work_dir}chrN/moran_log", end="")
except Exception:

if progress["missing_signal"] is not None:

print(f"\tSignal {progress['missing_signal']} is in the signal list but has not been processed with prep.\n"
"\tprocess that signal or remove it from the signal list.\nExiting...")

pool.close()
pool.join()

pool.close()
pool.join()
Expand Down
17 changes: 15 additions & 2 deletions metaloci/tools/prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from metaloci.misc import misc
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
import h5py
import hicstraw

warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)
Expand Down Expand Up @@ -91,7 +93,6 @@ def populate_args(parser):

optional_arg.add_argument("-h", "--help", action="help", help="Show this help message and exit.")


def run(opts):

if not opts.work_dir.endswith("/"):
Expand All @@ -108,10 +109,22 @@ def run(opts):
pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True)

if hic_path.endswith(".cool") or hic_path.endswith(".mcool"):

if resolution not in sorted([int(x) for x in list(h5py.File(hic_path)["resolutions"].keys())]):

exit("The given resolution is not in the provided cooler file. Exiting...")

hic_path = hic_path + "::/resolutions/" + str(resolution)

misc.check_chromosome_names(hic_path, data, coords)
misc.check_cooler_names(hic_path, data, coords)

elif hic_path.endswith(".hic"):

if resolution not in hicstraw.HiCFile(hic_path).getResolutions():

exit("The given resolution is not in the provided Hi-C file. Exiting...")

misc.check_hic_names(hic_path, data, coords)

column_dict = {}

Expand Down

0 comments on commit 13d1783

Please sign in to comment.