From 7b7fddf2261e28f64e14fe3107f5583ac6a3fc23 Mon Sep 17 00:00:00 2001
From: "yann.roussel"
Date: Tue, 10 Dec 2024 16:01:04 +0100
Subject: [PATCH] README.rst update for MET-types generation

---
 README.rst                                    |  12 ++
 .../convert_t_type_nrrds_to_me_type_nrrds.py  | 183 +++++++++++-------
 atlas_densities/app/process_t_types_job.sh    |  14 +-
 3 files changed, 137 insertions(+), 72 deletions(-)

diff --git a/README.rst b/README.rst
index 0224a25..f92265e 100644
--- a/README.rst
+++ b/README.rst
@@ -392,6 +392,18 @@ The command outputs the density files in the output-dir and a metadata json file
     // ...
 }
 
+Compute MET-type densities
+--------------------------
+
+An update to the probabilistic map makes it possible to estimate morphological and electrical (ME) type
+densities of inhibitory and excitatory neurons in the whole brain, following the pipeline of `Verasztó et al. (202X)`_.
+The pipeline produces transcriptomic type (T-type) densities together with a mapping from T-types to the
+canonical neuronal ME-types defined in the paper.
+
+.. code-block:: bash
+
+    sbatch atlas_densities/app/process_t_types_job.sh
+
 Subdivide excitatory files into pyramidal subtypes
 --------------------------------------------------
 
diff --git a/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py b/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
index bd4ce8e..b2bd797 100644
--- a/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
+++ b/atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py
@@ -1,6 +1,7 @@
 import argparse
+import glob
 import os
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import as_completed, ProcessPoolExecutor
 from tqdm import tqdm
 import pandas as pd
 import nrrd
@@ -12,83 +13,114 @@ def load_p_map(path_to_p_map, max_me_types=None):
     """
-    Load the P-map and format it with MET_TYPE keys (ME_TYPE|T_TYPE).
-
-    Args:
-        path_to_p_map (str): Path to the P-map CSV file.
-        max_me_types (int, optional): Limit the number of ME-types.
-
-    Returns:
-        pd.DataFrame: P-map reformatted with MET_TYPE keys as columns.
+    Load and preprocess the P-map, filtering out invalid T-type/M-type combinations:
+    - T-types containing 'Glut' can only combine with M-types containing 'PC'.
+    - T-types containing 'Gaba' can only combine with M-types containing 'IN'.
+
+    Args:
+        path_to_p_map (str): Path to the P-map CSV file (T-types as rows, ME-types as columns).
+
+    Returns:
+        pd.DataFrame: A single-row DataFrame whose columns are MET_TYPE keys
+        (ME_TYPE|T_TYPE), restricted to non-zero probabilities.
""" - # Load the P-map as a DataFrame + # Load the P-map p_map = pd.read_csv(path_to_p_map, index_col=0) + print(f"Loaded P-map shape: {p_map.shape}") - # Normalize rows to ensure probabilities sum to 1 - p_map = p_map.div(p_map.sum(axis=1), axis=0) - - # Create a new DataFrame with MET_TYPE keys (ME_TYPE|T_TYPE) - met_types_df = pd.DataFrame() - - for t_type in p_map.index: # Iterate over T-types (rows) - renamed_row = p_map.loc[t_type].rename(lambda me_type: f"{me_type}|{t_type}") # Create MET_TYPE keys - met_types_df = pd.concat([met_types_df, renamed_row.to_frame().T], axis=1) # Append renamed row - - # Limit the number of ME-types if specified - if max_me_types is not None: - met_types_df = met_types_df.iloc[:, :max_me_types] - - return met_types_df + # Fill NaNs with zeros + p_map = p_map.fillna(0) + print(f"Shape after filling NaNs: {p_map.shape}") + + # Apply filtering rules + def is_unvalid_combination(t_type, m_type): + if "Glut" in t_type and "IN" in m_type: + return True + if "Gaba" in t_type and "PC" in m_type: + return True + return False + + # Filter out invalid combinations + for col in p_map.columns: + for t_type in p_map.index: + if is_unvalid_combination(t_type, col): + p_map.at[t_type, col] = 0.0 + + # Normalize rows + row_sums = p_map.sum(axis=1) + print(f"Row sums before normalization: {row_sums.describe()}") + p_map = p_map.div(row_sums, axis=0).fillna(0) # Normalize and handle NaNs + print(f"Shape after normalization: {p_map.shape}") + + # Create MET keys + stacked_p_map = p_map.stack() + print(f"Stacked P-map shape: {stacked_p_map.shape}") + met_types = [f"{me_type}|{t_type}" for t_type, me_type in stacked_p_map.index] + met_df = pd.DataFrame([stacked_p_map.values], columns=met_types) + print(f"Initial MET DataFrame shape: {met_df.shape}") + + # Filter out MET columns with zero values + met_df = met_df.loc[:, (met_df != 0).any(axis=0)] + print(f"Filtered MET DataFrame shape: {met_df.shape}") + + return met_df -# Initialize output directory def setup_output_path(output_path): + """Create output directory if it doesn't exist.""" if not os.path.exists(output_path): os.makedirs(output_path) -# Process a single T-type -def process_t_type(t_type, t_type_path, p_map_subset, sizes): +def process_met_type(met_type, met_data, t_type_paths, sizes): + """Compute NRRD data for a single MET-type.""" try: - t_type_data, _ = nrrd.read(t_type_path) + t_type = met_type.split("|")[2].replace("-","_") # Extract T-type from MET-type + # Check if T-type exists in t_type_paths + if t_type not in t_type_paths: + print(f"Skipping MET-type {met_type}: T-type {t_type} not found in t_type_paths.") + return None + t_type_data, _ = nrrd.read(t_type_paths[t_type]) if t_type_data.shape != tuple(sizes): raise ValueError(f"Incorrect dimensions for {t_type}: {t_type_data.shape}") - - me_type_results = {} - for me_type, probability in p_map_subset.iteritems(): - me_type_results[me_type] = t_type_data * probability - - return me_type_results + return t_type_data * met_data.values except Exception as e: - print(f"Error processing T-type {t_type}: {e}") + print(f"Error processing MET-type {met_type}: {e}") return None -# Save ME-type results -def save_me_type_results(me_type_accum, output_path): - for me_type, data in me_type_accum.items(): - nrrd.write(os.path.join(output_path, f"{me_type}.nrrd"), data) +def save_met_type_result(met_type, data, output_path): + """Save NRRD file for a single MET-type.""" + try: + file_path = os.path.join(output_path, f"{met_type}.nrrd") + nrrd.write(file_path, data) + 
+    except Exception as e:
+        print(f"Error saving MET-type {met_type}: {e}")
+
+def process_batch(met_types_batch, p_map_subset, t_type_paths, output_path, sizes):
+    """Process a batch of MET-types sequentially."""
 
-# Process T-types in a batch
-def process_batch(t_type_list, t_type_paths, p_map, output_path, header):
-    with tqdm(total=len(t_type_list), desc="Processing T-types", position=0) as progress:
-        for t_type in t_type_list:
-            try:
-                p_map_subset = p_map.loc[t_type]
-                me_type_results = process_t_type(t_type, t_type_paths[t_type], p_map_subset, header['sizes'])
-                if me_type_results:
-                    save_me_type_results(me_type_results, output_path)
-            except Exception as e:
-                print(f"Error processing T-type {t_type}: {e}")
-            progress.update(1)
+    for met_type in tqdm(met_types_batch, desc="Processing MET-types"):
+        try:
+            data = process_met_type(met_type, p_map_subset[met_type], t_type_paths, sizes)
+            if data is not None:
+                save_met_type_result(met_type, data, output_path)
+        except Exception as e:
+            print(f"Error processing batch for {met_type}: {e}")
 
-# Main function
 def main():
-    parser = argparse.ArgumentParser(description="Process T-types for ME-types generation.")
-    parser.add_argument("--start", type=int, required=True, help="Start index of T-types to process")
-    parser.add_argument("--end", type=int, required=True, help="End index of T-types to process")
+    parser = argparse.ArgumentParser(description="Convert T-type NRRDs to MET-type NRRDs.")
+    parser.add_argument("--start", type=int, required=True, help="Start index (inclusive) of T-types to process")
+    parser.add_argument("--end", type=int, required=True, help="End index (inclusive) of T-types to process")
+    parser.add_argument(
+        "--workers", type=int, default=1, help="Number of worker processes to use"
+    )
     args = parser.parse_args()
 
-    # Load p_map and setup output
-    # p_map = load_p_map(PATH_TO_P_MAP, max_me_types=50)
-    p_map = load_p_map(PATH_TO_P_MAP, max_me_types=None)
+    print(f"Processing T-types from index {args.start} to {args.end}...")
+
+    # Validate the requested index range against the available T-type files
+    t_type_files = sorted(glob.glob(f"{PATH_TO_T_TYPES_NRRDS}/*.nrrd"))
+    if args.start < 0 or args.end >= len(t_type_files):
+        raise ValueError(f"Invalid range: {args.start} to {args.end} ({len(t_type_files)} T-type files available)")
+
+    t_type_batch = t_type_files[args.start:args.end + 1]
+    print(f"Selected {len(t_type_batch)} T-type files for processing.")
+    t_type_batch_list = [os.path.basename(f).replace(".nrrd", "") for f in t_type_batch]
+
+    # Load the P-map
+    p_map = load_p_map(PATH_TO_P_MAP)
     setup_output_path(OUTPUT_PATH)
 
     # Load T-type paths
@@ -96,18 +128,33 @@ def main():
     t_type_list = sorted([f.replace(".nrrd", "") for f in t_type_files])
     t_type_paths = {t_type: os.path.join(PATH_TO_T_TYPES_NRRDS, f"{t_type}.nrrd") for t_type in t_type_list}
 
-    # Filter T-types to process in this batch
-    batch_t_types = t_type_list[args.start:args.end]
-    if not batch_t_types:
-        print(f"No T-types to process in range {args.start}-{args.end}.")
+    # Keep only the MET-types whose T-type belongs to this batch
+    met_types = list(p_map.columns)
+    print(f"All MET-types: {len(met_types)}")
+    print(f"First T-types in batch: {t_type_batch_list[:10]}")
+    print(f"First MET-types: {met_types[:10]}")
+    met_types_batch = [
+        met_type for met_type in met_types
+        if met_type.split("|")[2].replace("-", "_") in t_type_batch_list
+    ]
+    print(f"Batch size: {len(met_types_batch)}")
+    if not met_types_batch:
+        print(f"No MET-types to process in range {args.start}-{args.end}.")
         return
 
-    # Get header from the first T-type
-    init_nrrd, header = nrrd.read(t_type_paths[batch_t_types[0]])
+    # Read the NRRD header from the first T-type (provides the expected voxel grid sizes)
+    _, header = nrrd.read(t_type_paths[t_type_list[0]])
 
-    # Process the batch
-    process_batch(batch_t_types, t_type_paths, p_map, OUTPUT_PATH, header)
+    # Split the MET-types evenly across the worker processes
+    batch_size = max(1, len(met_types_batch) // args.workers)
+    with ProcessPoolExecutor(max_workers=args.workers) as executor:
+        futures = []
+        for i in range(0, len(met_types_batch), batch_size):
+            batch = met_types_batch[i:i + batch_size]
+            futures.append(
+                executor.submit(process_batch, batch, p_map, t_type_paths, OUTPUT_PATH, header["sizes"])
+            )
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except Exception as e:
+                print(f"Error in parallel execution: {e}")
 
 if __name__ == "__main__":
     main()
-
diff --git a/atlas_densities/app/process_t_types_job.sh b/atlas_densities/app/process_t_types_job.sh
index 651a810..a658554 100644
--- a/atlas_densities/app/process_t_types_job.sh
+++ b/atlas_densities/app/process_t_types_job.sh
@@ -30,11 +30,17 @@ fi
 
 # Calculate start and end indices for this job
 START=$((SLURM_ARRAY_TASK_ID * BATCH_SIZE))
-END=$((START + BATCH_SIZE))
-if [ $END -gt $TOTAL_T_TYPES ]; then
-    END=$TOTAL_T_TYPES
+END=$((START + BATCH_SIZE - 1))  # Last index for this task (inclusive)
+if [ "$END" -ge "$TOTAL_T_TYPES" ]; then
+    END=$((TOTAL_T_TYPES - 1))  # Clamp the last batch to the final index
+fi
+
+# Ensure there is at least one T-type to process
+if [ "$START" -gt "$END" ]; then
+    echo "No T-types to process for task ID $SLURM_ARRAY_TASK_ID."
+    exit 0
 fi
 
 # Run the Python script
echo "Processing T-types from index $START to $END..."
-python convert_t_type_nrrds_to_me_type_nrrds.py --start $START --end $END
\ No newline at end of file
+python convert_t_type_nrrds_to_me_type_nrrds.py --start "$START" --end "$END" --workers 4
\ No newline at end of file
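
Reviewer note: the snippet below sketches how load_p_map() turns the P-map into
MET_TYPE keys. It assumes ME-type column names of the form "Mtype|Etype", which
is why met_type.split("|")[2] recovers the T-type elsewhere in the script; the
type names used here are made up for illustration only.

    import pandas as pd

    # Toy P-map: rows are T-types, columns are ME-types ("Mtype|Etype").
    p_map = pd.DataFrame(
        {"L5_PC|cADpyr": [1.0, 0.0], "L5_IN|cNAC": [0.0, 1.0]},
        index=["Foo_Glut", "Bar_Gaba"],
    )

    # stack() yields a Series indexed by (T-type, ME-type) pairs, row by row.
    stacked = p_map.stack()
    met_keys = [f"{me}|{t}" for t, me in stacked.index]
    # -> ['L5_PC|cADpyr|Foo_Glut', 'L5_IN|cNAC|Foo_Glut',
    #     'L5_PC|cADpyr|Bar_Gaba', 'L5_IN|cNAC|Bar_Gaba']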
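
For a quick check outside of Slurm, the conversion script can also be invoked
directly (both indices are inclusive, and the module-level constants
PATH_TO_P_MAP, PATH_TO_T_TYPES_NRRDS and OUTPUT_PATH must point at the data):

    python atlas_densities/app/convert_t_type_nrrds_to_me_type_nrrds.py --start 0 --end 9 --workers 4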