From 7b7fddf2261e28f64e14fe3107f5583ac6a3fc23 Mon Sep 17 00:00:00 2001
From: "yann.roussel"
Date: Tue, 10 Dec 2024 16:01:04 +0100
Subject: [PATCH] README.rst update for MET-types generation

---
 README.rst                                    |  12 ++
 .../convert_t_type_nrrds_to_me_type_nrrds.py  | 183 +++++++++++-------
 atlas_densities/app/process_t_types_job.sh    |  14 +-
 3 files changed, 137 insertions(+), 72 deletions(-)

diff --git a/README.rst b/README.rst
index 0224a25..f92265e 100644
--- a/README.rst
+++ b/README.rst
@@ -392,6 +392,18 @@ The command outputs the density files in the output-dir and a metadata json file
     // ...
 }
 
+Compute MET-type densities
+--------------------------
+
+An update to the probabilistic map makes it possible to estimate morphological and electrical (ME) type
+densities of inhibitory and excitatory neurons in the whole brain, following the pipeline of `Verasztó et al. (202X)`_.
+The pipeline produces transcriptomic type (T-type) densities together with a mapping from T-types to the
+canonical neuronal ME-types defined in the paper.
+
+.. code-block:: bash
+
+    sbatch atlas_densities/app/process_t_types_job.sh
+
 Subdivide excitatory files into pyramidal subtypes
 --------------------------------------------------
 
diff --git a/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py b/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
index bd4ce8e..b2bd797 100644
--- a/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
+++ b/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
@@ -1,6 +1,7 @@
 import argparse
+import glob
 import os
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import as_completed, ProcessPoolExecutor
 from tqdm import tqdm
 import pandas as pd
 import nrrd
@@ -12,83 +13,114 @@ def load_p_map(path_to_p_map, max_me_types=None):
     """
-    Load the P-map and format it with MET_TYPE keys (ME_TYPE|T_TYPE).
-
-    Args:
-        path_to_p_map (str): Path to the P-map CSV file.
-        max_me_types (int, optional): Limit the number of ME-types.
-
-    Returns:
-        pd.DataFrame: P-map reformatted with MET_TYPE keys as columns.
+    Load and preprocess the P-map, filtering out invalid T-type/M-type combinations:
+    - T-types containing 'Glut' can only combine with M-types containing 'PC'.
+    - T-types containing 'Gaba' can only combine with M-types containing 'IN'.
+
+    Args:
+        path_to_p_map (str): Path to the P-map CSV file (T-types as rows, ME-types as columns).
+
+    Returns:
+        pd.DataFrame: A single-row DataFrame whose columns are MET_TYPE keys
+        (ME_TYPE|T_TYPE), restricted to non-zero probabilities.
""" - # Load the P-map as a DataFrame + # Load the P-map p_map = pd.read_csv(path_to_p_map, index_col=0) + print(f"Loaded P-map shape: {p_map.shape}") - # Normalize rows to ensure probabilities sum to 1 - p_map = p_map.div(p_map.sum(axis=1), axis=0) - - # Create a new DataFrame with MET_TYPE keys (ME_TYPE|T_TYPE) - met_types_df = pd.DataFrame() - - for t_type in p_map.index: # Iterate over T-types (rows) - renamed_row = p_map.loc[t_type].rename(lambda me_type: f"{me_type}|{t_type}") # Create MET_TYPE keys - met_types_df = pd.concat([met_types_df, renamed_row.to_frame().T], axis=1) # Append renamed row - - # Limit the number of ME-types if specified - if max_me_types is not None: - met_types_df = met_types_df.iloc[:, :max_me_types] - - return met_types_df + # Fill NaNs with zeros + p_map = p_map.fillna(0) + print(f"Shape after filling NaNs: {p_map.shape}") + + # Apply filtering rules + def is_unvalid_combination(t_type, m_type): + if "Glut" in t_type and "IN" in m_type: + return True + if "Gaba" in t_type and "PC" in m_type: + return True + return False + + # Filter out invalid combinations + for col in p_map.columns: + for t_type in p_map.index: + if is_unvalid_combination(t_type, col): + p_map.at[t_type, col] = 0.0 + + # Normalize rows + row_sums = p_map.sum(axis=1) + print(f"Row sums before normalization: {row_sums.describe()}") + p_map = p_map.div(row_sums, axis=0).fillna(0) # Normalize and handle NaNs + print(f"Shape after normalization: {p_map.shape}") + + # Create MET keys + stacked_p_map = p_map.stack() + print(f"Stacked P-map shape: {stacked_p_map.shape}") + met_types = [f"{me_type}|{t_type}" for t_type, me_type in stacked_p_map.index] + met_df = pd.DataFrame([stacked_p_map.values], columns=met_types) + print(f"Initial MET DataFrame shape: {met_df.shape}") + + # Filter out MET columns with zero values + met_df = met_df.loc[:, (met_df != 0).any(axis=0)] + print(f"Filtered MET DataFrame shape: {met_df.shape}") + + return met_df -# Initialize output directory def setup_output_path(output_path): + """Create output directory if it doesn't exist.""" if not os.path.exists(output_path): os.makedirs(output_path) -# Process a single T-type -def process_t_type(t_type, t_type_path, p_map_subset, sizes): +def process_met_type(met_type, met_data, t_type_paths, sizes): + """Compute NRRD data for a single MET-type.""" try: - t_type_data, _ = nrrd.read(t_type_path) + t_type = met_type.split("|")[2].replace("-","_") # Extract T-type from MET-type + # Check if T-type exists in t_type_paths + if t_type not in t_type_paths: + print(f"Skipping MET-type {met_type}: T-type {t_type} not found in t_type_paths.") + return None + t_type_data, _ = nrrd.read(t_type_paths[t_type]) if t_type_data.shape != tuple(sizes): raise ValueError(f"Incorrect dimensions for {t_type}: {t_type_data.shape}") - - me_type_results = {} - for me_type, probability in p_map_subset.iteritems(): - me_type_results[me_type] = t_type_data * probability - - return me_type_results + return t_type_data * met_data.values except Exception as e: - print(f"Error processing T-type {t_type}: {e}") + print(f"Error processing MET-type {met_type}: {e}") return None -# Save ME-type results -def save_me_type_results(me_type_accum, output_path): - for me_type, data in me_type_accum.items(): - nrrd.write(os.path.join(output_path, f"{me_type}.nrrd"), data) +def save_met_type_result(met_type, data, output_path): + """Save NRRD file for a single MET-type.""" + try: + file_path = os.path.join(output_path, f"{met_type}.nrrd") + nrrd.write(file_path, data) + 
+    except Exception as e:
+        print(f"Error saving MET-type {met_type}: {e}")
+
+def process_batch(met_types_batch, p_map_subset, t_type_paths, output_path, sizes):
+    """Process a batch of MET-types sequentially."""
 
-# Process T-types in a batch
-def process_batch(t_type_list, t_type_paths, p_map, output_path, header):
-    with tqdm(total=len(t_type_list), desc="Processing T-types", position=0) as progress:
-        for t_type in t_type_list:
-            try:
-                p_map_subset = p_map.loc[t_type]
-                me_type_results = process_t_type(t_type, t_type_paths[t_type], p_map_subset, header['sizes'])
-                if me_type_results:
-                    save_me_type_results(me_type_results, output_path)
-            except Exception as e:
-                print(f"Error processing T-type {t_type}: {e}")
-            progress.update(1)
+    for met_type in tqdm(met_types_batch, desc="Processing MET-types"):
+        try:
+            data = process_met_type(met_type, p_map_subset[met_type], t_type_paths, sizes)
+            if data is not None:
+                save_met_type_result(met_type, data, output_path)
+        except Exception as e:
+            print(f"Error processing batch for {met_type}: {e}")
 
-# Main function
 def main():
-    parser = argparse.ArgumentParser(description="Process T-types for ME-types generation.")
-    parser.add_argument("--start", type=int, required=True, help="Start index of T-types to process")
-    parser.add_argument("--end", type=int, required=True, help="End index of T-types to process")
+    parser = argparse.ArgumentParser(description="Convert T-type NRRDs to MET-type NRRDs.")
+    parser.add_argument("--start", type=int, required=True, help="Start index (inclusive) of T-types to process")
+    parser.add_argument("--end", type=int, required=True, help="End index (inclusive) of T-types to process")
+    parser.add_argument(
+        "--workers", type=int, default=1, help="Number of worker processes to use"
+    )
     args = parser.parse_args()
 
-    # Load p_map and setup output
-    # p_map = load_p_map(PATH_TO_P_MAP, max_me_types=50)
-    p_map = load_p_map(PATH_TO_P_MAP, max_me_types=None)
+    print(f"Processing T-types from index {args.start} to {args.end}...")
+
+    # Validate the requested index range against the available T-type files
+    t_type_files = sorted(glob.glob(f"{PATH_TO_T_TYPES_NRRDS}/*.nrrd"))
+    if args.start < 0 or args.end >= len(t_type_files):
+        raise ValueError(f"Invalid range: {args.start} to {args.end} ({len(t_type_files)} T-type files available)")
+
+    t_type_batch = t_type_files[args.start:args.end + 1]
+    print(f"Selected {len(t_type_batch)} T-type files for processing.")
+    t_type_batch_list = [os.path.basename(f).replace(".nrrd", "") for f in t_type_batch]
+
+    # Load the P-map
+    p_map = load_p_map(PATH_TO_P_MAP)
     setup_output_path(OUTPUT_PATH)
 
     # Load T-type paths
@@ -96,18 +128,33 @@ def main():
     t_type_list = sorted([f.replace(".nrrd", "") for f in t_type_files])
     t_type_paths = {t_type: os.path.join(PATH_TO_T_TYPES_NRRDS, f"{t_type}.nrrd") for t_type in t_type_list}
 
-    # Filter T-types to process in this batch
-    batch_t_types = t_type_list[args.start:args.end]
-    if not batch_t_types:
-        print(f"No T-types to process in range {args.start}-{args.end}.")
+    # Keep only the MET-types whose T-type belongs to this batch
+    met_types = list(p_map.columns)
+    print(f"All MET-types: {len(met_types)}")
+    print(f"First T-types in batch: {t_type_batch_list[:10]}")
+    print(f"First MET-types: {met_types[:10]}")
+    met_types_batch = [
+        met_type for met_type in met_types
+        if met_type.split("|")[2].replace("-", "_") in t_type_batch_list
+    ]
+    print(f"Batch size: {len(met_types_batch)}")
+    if not met_types_batch:
+        print(f"No MET-types to process in range {args.start}-{args.end}.")
         return
 
-    # Get header from the first T-type
-    init_nrrd, header = nrrd.read(t_type_paths[batch_t_types[0]])
+    # Read the NRRD header from the first T-type (provides the expected voxel grid sizes)
+    _, header = nrrd.read(t_type_paths[t_type_list[0]])
 
-    # Process the batch
-    process_batch(batch_t_types, t_type_paths, p_map, OUTPUT_PATH, header)
+    # Split the MET-types evenly across the worker processes
+    batch_size = max(1, len(met_types_batch) // args.workers)
+    with ProcessPoolExecutor(max_workers=args.workers) as executor:
+        futures = []
+        for i in range(0, len(met_types_batch), batch_size):
+            batch = met_types_batch[i:i + batch_size]
+            futures.append(
+                executor.submit(process_batch, batch, p_map, t_type_paths, OUTPUT_PATH, header["sizes"])
+            )
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Error in parallel execution: {e}")
 
 if __name__ == "__main__":
     main()
-
diff --git a/atlas_densities/app/process_t_types_job.sh b/atlas_densities/app/process_t_types_job.sh
index 651a810..a658554 100644
--- a/atlas_densities/app/process_t_types_job.sh
+++ b/atlas_densities/app/process_t_types_job.sh
@@ -30,11 +30,17 @@ fi
 
 # Calculate start and end indices for this job
 START=$((SLURM_ARRAY_TASK_ID * BATCH_SIZE))
-END=$((START + BATCH_SIZE))
-if [ $END -gt $TOTAL_T_TYPES ]; then
-    END=$TOTAL_T_TYPES
+END=$((START + BATCH_SIZE - 1))  # Last index for this task (inclusive)
+if [ "$END" -ge "$TOTAL_T_TYPES" ]; then
+    END=$((TOTAL_T_TYPES - 1))  # Clamp the last batch to the final index
+fi
+
+# Ensure there is at least one T-type to process
+if [ "$START" -gt "$END" ]; then
+    echo "No T-types to process for task ID $SLURM_ARRAY_TASK_ID."
+    exit 0
 fi
 
 # Run the Python script
echo "Processing T-types from index $START to $END..."
-python convert_t_type_nrrds_to_me_type_nrrds.py --start $START --end $END
\ No newline at end of file
+python convert_t_type_nrrds_to_me_type_nrrds.py --start "$START" --end "$END" --workers 4
\ No newline at end of file
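
Reviewer note: the snippet below sketches how load_p_map() turns the P-map into
MET_TYPE keys. It assumes ME-type column names of the form "Mtype|Etype", which
is why met_type.split("|")[2] recovers the T-type elsewhere in the script; the
type names used here are made up for illustration only.

    import pandas as pd

    # Toy P-map: rows are T-types, columns are ME-types ("Mtype|Etype").
    p_map = pd.DataFrame(
        {"L5_PC|cADpyr": [1.0, 0.0], "L5_IN|cNAC": [0.0, 1.0]},
        index=["Foo_Glut", "Bar_Gaba"],
    )

    # stack() yields a Series indexed by (T-type, ME-type) pairs, row by row.
    stacked = p_map.stack()
    met_keys = [f"{me}|{t}" for t, me in stacked.index]
    # -> ['L5_PC|cADpyr|Foo_Glut', 'L5_IN|cNAC|Foo_Glut',
    #     'L5_PC|cADpyr|Bar_Gaba', 'L5_IN|cNAC|Bar_Gaba']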
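
For a quick check outside of Slurm, the conversion script can also be invoked
directly (both indices are inclusive, and the module-level constants
PATH_TO_P_MAP, PATH_TO_T_TYPES_NRRDS and OUTPUT_PATH must point at the data):

    python atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py --start 0 --end 9 --workers 4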