diff --git a/metaloci/misc/misc.py b/metaloci/misc/misc.py index ad5cbf7..ee04d39 100644 --- a/metaloci/misc/misc.py +++ b/metaloci/misc/misc.py @@ -82,7 +82,7 @@ def check_diagonal(diagonal: np.ndarray) -> tuple[int, float, float, list]: return total, percentage_stretch, percentage_zeroes, zero_loc -def clean_matrix(mlobject: mlo.MetalociObject, bad_regions: pd.DataFrame) -> np.ndarray: +def clean_matrix(mlobject: mlo.MetalociObject) -> np.ndarray: """ Clean a given HiC matrix. It checks if the matrix has too many zeroes at he diagonal, removes values that are zero at the diagonal but are not in @@ -94,8 +94,6 @@ def clean_matrix(mlobject: mlo.MetalociObject, bad_regions: pd.DataFrame) -> np. ---------- mlo : np.ndarray METALoci object with a matrix in it. - bad_regions : dict - Dictionay {"region": [], "reason": []} in which to append bad regions. Returns ------- @@ -169,7 +167,9 @@ def signal_normalization(region_signal: pd.DataFrame, pseudocounts: float = None if pseudocounts is None: - signal = [0.0 if np.isnan(index) else index for index in region_signal] + # signal = [0.0 if np.isnan(index) else index for index in region_signal] + median_default = np.nanmedian(region_signal) # jfk fix + signal = [median_default if np.isnan(index) else index for index in region_signal] # jfk fix else: @@ -194,7 +194,7 @@ def signal_normalization(region_signal: pd.DataFrame, pseudocounts: float = None return np.array(signal) -def check_chromosome_names(hic_file: Path, data: Path, coords: bool): +def check_cooler_names(hic_file: Path, data: Path, coords: bool): with open(data[0], "r") as handler: @@ -206,27 +206,60 @@ def check_chromosome_names(hic_file: Path, data: Path, coords: bool): signal_chr_nom = "N" - try: + hic_file = cooler.Cooler(hic_file) - hic_file = cooler.Cooler(hic_file) + if "chr" in [hic_file.chromnames][0][0]: - if "chr" in [hic_file.chromnames][0][0]: + cooler_chr_nom = "chrN" - cooler_chr_nom = "chrN" + else: + + cooler_chr_nom = "N" + + del hic_file + + with open(coords, "r") as handler: + + if [line.strip() for line in handler][1].startswith("chr"): + + coords_chr_nom = "chrN" else: - cooler_chr_nom = "N" + coords_chr_nom = "N" + + if not signal_chr_nom == cooler_chr_nom == coords_chr_nom: + + exit( + "\nThe signal, cooler and chromosome sizes files do not have the same nomenclature for chromosomes:\n" + f"\n\tSignal chromosomes nomenclature is '{signal_chr_nom}'. " + f"\n\tHi-C chromosomes nomenclature is '{cooler_chr_nom}'. " + f"\n\tChromosome sizes nomenclature is '{coords_chr_nom}'. " + "\n\nPlease, rename the chromosome names. " + "\nYou may want to rename the chromosome names in a cooler file with cooler.rename_chroms() in python. " + "\n\nExiting..." + ) + - except OSError: #CHECK EXCEPTION +def check_hic_names(hic_file: Path, data: Path, coords: bool): - if "chr" in hicstraw.HiCFile(hic_file).getChromosomes()[1].name: + with open(data[0], "r") as handler: + + if [line.strip() for line in handler][1].startswith("chr"): - cooler_chr_nom = "chrN" + signal_chr_nom = "chrN" else: - cooler_chr_nom = "N" + signal_chr_nom = "N" + + if "chr" in hicstraw.HiCFile(hic_file).getChromosomes()[1].name: + + hic_chr_nom = "chrN" + + else: + + hic_chr_nom = "N" del hic_file @@ -240,14 +273,13 @@ def check_chromosome_names(hic_file: Path, data: Path, coords: bool): coords_chr_nom = "N" - if not signal_chr_nom == cooler_chr_nom == coords_chr_nom: + if not signal_chr_nom == hic_chr_nom == coords_chr_nom: exit( "\nThe signal, cooler and chromosome sizes files do not have the same nomenclature for chromosomes:\n" f"\n\tSignal chromosomes nomenclature is '{signal_chr_nom}'. " - f"\n\tHi-C chromosomes nomenclature is '{cooler_chr_nom}'. " + f"\n\tHi-C chromosomes nomenclature is '{hic_chr_nom}'. " f"\n\tChromosome sizes nomenclature is '{coords_chr_nom}'. " "\n\nPlease, rename the chromosome names. " - "\nYou may want to rename the chromosome names in a cooler file with cooler.rename_chroms() in python. " "\n\nExiting..." ) diff --git a/metaloci/spatial_stats/lmi.py b/metaloci/spatial_stats/lmi.py index 4744002..68f8dae 100644 --- a/metaloci/spatial_stats/lmi.py +++ b/metaloci/spatial_stats/lmi.py @@ -201,16 +201,23 @@ def load_region_signals(mlobject: mlo.MetalociObject, signal_data: dict, signal_ signal_types = [line.rstrip() for line in signals_handler] + # region_signal = signal_data[mlobject.chrom][ + # (signal_data[mlobject.chrom]["start"] >= int(mlobject.start / mlobject.resolution) * mlobject.resolution) + # & (signal_data[mlobject.chrom]["end"] <= int(mlobject.end / mlobject.resolution) * mlobject.resolution) + # ] + region_signal = signal_data[mlobject.chrom][ - (signal_data[mlobject.chrom]["start"] >= int(mlobject.start / mlobject.resolution) * mlobject.resolution) - & (signal_data[mlobject.chrom]["end"] <= int(mlobject.end / mlobject.resolution) * mlobject.resolution) - ] + (signal_data[mlobject.chrom]["start"] >= int(np.floor(mlobject.start / mlobject.resolution)) * mlobject.resolution) + & (signal_data[mlobject.chrom]["end"] <= int(np.ceil(mlobject.end / mlobject.resolution)) * mlobject.resolution) + ] # jfm: fix if len(region_signal) != len(mlobject.kk_coords): tmp = len(mlobject.kk_coords) - len(region_signal) tmp = np.empty((tmp, len(region_signal.columns))) - tmp[:] = 0 + # tmp[:] = 0 + tmp[:] = np.nan # jfm: leave NAs in for now. (in misc.signal_normalization() those NaNS will be substituted for + # the median of the signal of the region) region_signal = pd.concat([region_signal, pd.DataFrame(tmp, columns=list(region_signal))], ignore_index=True) @@ -218,7 +225,13 @@ def load_region_signals(mlobject: mlo.MetalociObject, signal_data: dict, signal_ for signal_type in signal_types: - signals_dict[signal_type] = misc.signal_normalization(region_signal[signal_type]) + try: + + signals_dict[signal_type] = misc.signal_normalization(region_signal[signal_type]) + + except KeyError: + + return None, signal_type return signals_dict, signal_types diff --git a/metaloci/tools/ml.py b/metaloci/tools/ml.py index d640d85..5b070a5 100644 --- a/metaloci/tools/ml.py +++ b/metaloci/tools/ml.py @@ -156,14 +156,21 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool if mlobject.kk_nodes is None: if silent == False: - - print("Kamada-Kawai layout has not been calculated for this region. \n\tSkipping to next region...") + print("\tKamada-Kawai layout has not been calculated for this region. \n\tSkipping to next region...") return # Load only signal for this specific region. mlobject.signals_dict, signal_types = lmi.load_region_signals(mlobject, signal_data, signals) + # If the signal is not present in the signals folder (has not been processed with prep) but it is in + # the list of signal to process, raise an exception and print which signals need to be processed. + if mlobject.signals_dict == None and progress is not None: + + progress["missing_signal"] = signal_types + + raise Exception() + # This checks if every signal you want to process is already computed. If the user works with a few signals # but decides to add some more later, she can use the same working directory and KK, and the LMI will be # computed again with the new signals. If everything is already computed, does nothing. @@ -173,7 +180,6 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool if [signal for signal, _ in mlobject.lmi_info.items()] == signal_types: if silent == False: - print("\tLMI already computed for this region. \n\tSkipping to next region...") if progress is not None: progress["done"] = True @@ -188,7 +194,6 @@ def get_lmi(region_iter, opts, signal_data, progress=None, i=None, silent: bool df.to_csv(f"{moran_log_path}/{mlobject.region}_{mlobject.poi}_{signal}.txt", sep="\t", index=False) if silent == False: - print(f"\tLog saved to: {moran_log_path}/{mlobject.region}_{mlobject.poi}_{signal}.txt") return @@ -288,19 +293,31 @@ def run(opts): try: manager = mp.Manager() - progress = manager.dict(value=0, timer = start_timer, done = False) + progress = manager.dict(value=0, timer = start_timer, done = False, missing_signal = None) with mp.Pool(processes=cores) as pool: - pool.starmap(get_lmi, [(row, opts, signal_data, progress) for _, row in df_regions.iterrows()]) + try: - if progress["done"] == True: + pool.starmap(get_lmi, [(row, opts, signal_data, progress) for _, row in df_regions.iterrows()]) - print("\tSome regions had already been computed and have been skipped.", end="") - - if moran_log: + if progress["done"] == True: + + print("\tSome regions had already been computed and have been skipped.", end="") + + if moran_log: + + print(f"\n\tLog saved to: {work_dir}chrN/moran_log", end="") - print(f"\n\tLog saved to: {work_dir}chrN/moran_log", end="") + except Exception: + + if progress["missing_signal"] is not None: + + print(f"\tSignal {progress['missing_signal']} is in the signal list but has not been processed with prep.\n" + "\tprocess that signal or remove it from the signal list.\nExiting...") + + pool.close() + pool.join() pool.close() pool.join() diff --git a/metaloci/tools/prep.py b/metaloci/tools/prep.py index 014b2a0..e379296 100644 --- a/metaloci/tools/prep.py +++ b/metaloci/tools/prep.py @@ -13,6 +13,8 @@ from metaloci.misc import misc from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning import warnings +import h5py +import hicstraw warnings.simplefilter('ignore', category=NumbaDeprecationWarning) warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) @@ -91,7 +93,6 @@ def populate_args(parser): optional_arg.add_argument("-h", "--help", action="help", help="Show this help message and exit.") - def run(opts): if not opts.work_dir.endswith("/"): @@ -108,10 +109,22 @@ def run(opts): pathlib.Path(tmp_dir).mkdir(parents=True, exist_ok=True) if hic_path.endswith(".cool") or hic_path.endswith(".mcool"): + + if resolution not in sorted([int(x) for x in list(h5py.File(hic_path)["resolutions"].keys())]): + + exit("The given resolution is not in the provided cooler file. Exiting...") hic_path = hic_path + "::/resolutions/" + str(resolution) - misc.check_chromosome_names(hic_path, data, coords) + misc.check_cooler_names(hic_path, data, coords) + + elif hic_path.endswith(".hic"): + + if resolution not in hicstraw.HiCFile(hic_path).getResolutions(): + + exit("The given resolution is not in the provided Hi-C file. Exiting...") + + misc.check_hic_names(hic_path, data, coords) column_dict = {}