From 38cfee2d1114f1955548128ff284659af0d9e271 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sat, 5 Oct 2024 13:39:24 -0400 Subject: [PATCH 01/22] ETL Tested and Ready --- seq_to_pheno/tcga/add_contigs.sh | 40 ------ seq_to_pheno/tcga/get_snp_eff.sh | 45 ------- seq_to_pheno/tcga/set_tcga_data.py | 198 +++++++++++++++++++++++++---- 3 files changed, 176 insertions(+), 107 deletions(-) delete mode 100755 seq_to_pheno/tcga/add_contigs.sh delete mode 100644 seq_to_pheno/tcga/get_snp_eff.sh diff --git a/seq_to_pheno/tcga/add_contigs.sh b/seq_to_pheno/tcga/add_contigs.sh deleted file mode 100755 index 7c80c7f..0000000 --- a/seq_to_pheno/tcga/add_contigs.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Directories -# VCF_DIR="seq_to_pheno/tcga/data/variants/old_indel" # Replace with the path to your VCF files -# OUTPUT_DIR="seq_to_pheno/tcga/data/variants/indel" # Replace with the desired output directory -# CONTIGS_FILE="contigs.txt" # Path to your contigs.txt file - -# mkdir -p "$OUTPUT_DIR" - -for vcf_file in "$VCF_DIR"/*.vcf.gz; do - vcf_basename=$(basename "$vcf_file") - output_file="$OUTPUT_DIR/$vcf_basename" - - echo "Processing $vcf_basename" - - # Add contig lines to the VCF header - bcftools annotate --header-lines "$CONTIGS_FILE" -O z -o "$output_file" "$vcf_file" - - # Index the updated VCF file - tabix -p vcf "$output_file" -done - -VCF_DIR="seq_to_pheno/tcga/data/variants/old_snv_mnv" # Replace with the path to your VCF files -OUTPUT_DIR="seq_to_pheno/tcga/data/variants/snv_mnv" # Replace with the desired output directory -CONTIGS_FILE="contigs.txt" # Path to your contigs.txt file - -mkdir -p "$OUTPUT_DIR" - -for vcf_file in "$VCF_DIR"/*.vcf.gz; do - vcf_basename=$(basename "$vcf_file") - output_file="$OUTPUT_DIR/$vcf_basename" - - echo "Processing $vcf_basename" - - # Add contig lines to the VCF header - bcftools annotate --header-lines "$CONTIGS_FILE" -O z -o "$output_file" "$vcf_file" - - # Index the updated VCF file - tabix -p vcf "$output_file" -done \ No newline at end of file diff --git a/seq_to_pheno/tcga/get_snp_eff.sh b/seq_to_pheno/tcga/get_snp_eff.sh deleted file mode 100644 index 3480c6f..0000000 --- a/seq_to_pheno/tcga/get_snp_eff.sh +++ /dev/null @@ -1,45 +0,0 @@ -# Go to home dir -cd - -# Download latest version -wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip - -# Unzip file -unzip snpEff_latest_core.zip - -# Download Annotation Database -java -jar snpEff/snpEff.jar download -v GRCh37.75 - -cat < contigs.txt -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -EOF - -# This adds contigs to all VCF files for proper GRCh37 indexing -# Adjust the add_contogs.sh script to point to your own indel and snv_mnv directory -chmod +x add_contigs.sh -./add_contigs.sh - diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index 7b26a71..e3f841c 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -11,12 +11,126 @@ import sys import requests from Bio.Seq import Seq +import datetime +import shutil +# NOTE: Script shall be run from the top of the repo +os.chdir('/Users/harrison.reed/codebase/seq-to-pheno') + # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') -def download_file(s3_path: str, local_dir: str, 
endpoint_url: str) -> None: +def prepare_reference(ref): + if not os.path.exists(ref) and not os.path.exists(ref.replace('.gz', '')): + wget_file('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz', ref) + unzip_zip(ref) + index_reference(ref.replace('.gz', '')) + + +def add_contigs(snv_dir, indel_dir, contigs_file): + for vcf_dir in [snv_dir, indel_dir]: + for filename in os.listdir(vcf_dir): + if filename.endswith('.vcf') or filename.endswith('.vcf.gz'): + vcf_input_path = os.path.join(vcf_dir, filename) + print(f"Processing {filename}...") + add_contigs_with_bcftools(vcf_input_path, contigs_file) + + +def add_contigs_with_bcftools(vcf_input_path, contigs_file): + temp_header = vcf_input_path + '.header.txt' + new_header = vcf_input_path + '.new_header.txt' + temp_vcf = vcf_input_path + '.tmp.vcf.gz' + + # Step 1: Extract the VCF header + with open(temp_header, 'w') as header_file: + cmd = ['bcftools', 'view', '-h', vcf_input_path] + logging.info(f'Extracting header from {vcf_input_path}') + subprocess.run(cmd, stdout=header_file, check=True) + + # Step 2 and 3: Remove existing contig lines and insert new contig lines before #CHROM line + with open(temp_header, 'r') as infile, open(new_header, 'w') as outfile: + for line in infile: + if line.startswith('##contig='): + continue # Skip existing contig lines + elif line.startswith('#CHROM'): + # Insert new contig lines before the #CHROM line + with open(contigs_file, 'r') as contigs: + outfile.writelines(contigs) + outfile.write(line) # Write the #CHROM line + else: + outfile.write(line) + + # Step 4: Replace the VCF header + cmd = ['bcftools', 'reheader', '-h', new_header, '-o', temp_vcf, vcf_input_path] + logging.info(f'Replacing header of {vcf_input_path}') + subprocess.run(cmd, check=True) + + # Step 5: Replace the original VCF file with the updated one + shutil.move(temp_vcf, vcf_input_path) + + # Step 6: Index the updated VCF file + cmd = ['tabix', '-p', 'vcf', vcf_input_path] + logging.info(f'Indexing {vcf_input_path}') + subprocess.run(cmd, check=True) + + # Clean up temporary files + os.remove(temp_header) + os.remove(new_header) + + +def create_contigs(fai_file, contigs_file): + with open(fai_file, 'r') as fai, open(contigs_file, 'w') as out: + for line in fai: + fields = line.strip().split('\t') + contig_id = fields[0] + contig_length = fields[1] + out.write(f'##contig=\n') + + +def index_reference(ref): + try: + command = ['samtools', 'faidx', ref] + logging.info(f"Indexing reference sequence {ref}") + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logging.error(f"Failed to index reference sequence {ref}: {e}") + +def download_annotation_database(snpeff_jar, genome): + try: + command = ['java', '-jar', snpeff_jar, 'download', '-v', genome] + logging.info("Downloading annotation database") + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logging.error(f"Failed to download annotation database {genome}: {e}") + +def unzip_zip(zip): + try: + command = ['unzip', zip] + logging.info(f"Unzipping {zip} to {zip.replace('.zip', '')}") + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logging.error(f"Failed to unzip {zip}: {e}") + +def gunzip(gz): + try: + command = ['gunzip', gz] + logging.info(f"gunzipping {gz} to {gz.replace('.gz', '')}") + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logging.error(f"Failed to gunzip 
{gz}: {e}") + + +def wget_file(url, out_path): + try: + command = ['wget', '-O', out_path, url] + logging.info(f"Downloading {url} to {out_path}") + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + logging.error(f"Failed to download {url}: {e}") + + +def download_s3_file(s3_path: str, local_dir: str, endpoint_url: str) -> None: """ Downloads a file from an S3 bucket using AWS CLI. @@ -76,7 +190,8 @@ def get_wgs_aliquot_ids_from_filenames(directory: str) -> Set[str]: def add_has_consensus_data_column(metadata_df: pd.DataFrame, wgs_ids_set: Set[str]) -> pd.DataFrame: """ - Adds a 'has_consensus_data' column to the metadata DataFrame. + Adds a 'has_consensus_data' column to the metadata DataFrame. This ensures + that we highlight subjects that have a complete dataset. Parameters: metadata_df (pd.DataFrame): The original metadata DataFrame. @@ -320,23 +435,42 @@ def main(): ENDPOINT_URL = 'https://object.genomeinformatics.org' DATA_DIR = 'seq_to_pheno/tcga/data' VARIANT_DIR = os.path.join(DATA_DIR, 'variants') - TRANSCRIPT_DATA_S3 = 's3://icgc25k-open/PCAWG/transcriptome/transcript_expression/pcawg.rnaseq.transcript.expr.tpm.tsv.gz' - METADATA_S3 = 's3://icgc25k-open/PCAWG/transcriptome/metadata/rnaseq.extended.metadata.aliquot_id.V4.tsv.gz' - SNV_INDEL_S3 = 's3://icgc25k-open/PCAWG/consensus_snv_indel/final_consensus_snv_indel_passonly_icgc.public.tgz' + SNV_DIR = os.path.join(VARIANT_DIR, 'snv_mnv/') + INDEL_DIR = os.path.join(VARIANT_DIR, 'indel') + TRANSCRIPT_BASENAME = 'pcawg.rnaseq.transcript.expr.tpm.tsv.gz' + TRANSCRIPT_DATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/transcript_expression/{TRANSCRIPT_BASENAME}' + METADATA_BASENAME = 'rnaseq.extended.metadata.aliquot_id.V4.tsv.gz' + METADATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/metadata/{METADATA_BASENAME}' + SNV_INDEL_BASENAME = "final_consensus_snv_indel_passonly_icgc.public.tgz" + SNV_INDEL_S3 = f's3://icgc25k-open/PCAWG/consensus_snv_indel/{SNV_INDEL_BASENAME}' SNPEFF_JAR = '/Users/harrison.reed/snpEff/snpEff.jar' # Update with the correct path to your snpEff.jar - GENOME_VERSION = 'GRCh37.75' # Ensure this is the genome version you downloaded for snpEff - + REFERENCE_SHORT = 'GRCh37.75' # Make sure snpEff and reference genome match! 
+ REFERENCE_FA = os.path.join(DATA_DIR, 'genome.fa.gz') + REFERENCE_GTF = os.path.join(DATA_DIR, 'Homo_sapiens.GRCh37.75.gtf.gz') + # Create data directories if they don't exist os.makedirs(DATA_DIR, exist_ok=True) os.makedirs(VARIANT_DIR, exist_ok=True) # Download files - download_file(TRANSCRIPT_DATA_S3, DATA_DIR, ENDPOINT_URL) - download_file(METADATA_S3, DATA_DIR, ENDPOINT_URL) - download_file(SNV_INDEL_S3, DATA_DIR, ENDPOINT_URL) - - # Extract SNV and Indel data - extract_tar_gz(os.path.join(DATA_DIR, 'final_consensus_snv_indel_passonly_icgc.public.tgz'), VARIANT_DIR) + logging.info('Downloading ICGC data: Transcript Expression ') + if not os.path.exists(os.path.join(DATA_DIR, TRANSCRIPT_BASENAME)): + if not os.path.exists(os.path.join(DATA_DIR, TRANSCRIPT_BASENAME.replace(".gz", ''))): + logging.info(os.path.join(DATA_DIR, TRANSCRIPT_BASENAME.replace(".gz", ''))) + download_s3_file(TRANSCRIPT_DATA_S3, DATA_DIR, ENDPOINT_URL) + + logging.info('Downloading ICGC data: Metadata') + if not os.path.exists(os.path.join(DATA_DIR, METADATA_BASENAME)): + if not os.path.exists(os.path.join(DATA_DIR, METADATA_BASENAME.replace(".gz", ''))): + logging.info(os.path.join(DATA_DIR, METADATA_BASENAME.replace(".gz", ''))) + download_s3_file(METADATA_S3, DATA_DIR, ENDPOINT_URL) + + logging.info('Downloading ICGC data: SNV and Indels ') + if not os.path.exists(os.path.join(DATA_DIR, SNV_INDEL_BASENAME)): + if not os.path.exists(os.path.join(DATA_DIR, SNV_INDEL_BASENAME.replace(".gz", ''))): + logging.info(os.path.join(DATA_DIR, SNV_INDEL_BASENAME.replace(".gz", ''))) + download_s3_file(SNV_INDEL_S3, DATA_DIR, ENDPOINT_URL) + extract_tar_gz(os.path.join(DATA_DIR, 'final_consensus_snv_indel_passonly_icgc.public.tgz'), VARIANT_DIR) # Paths to variant directories indel_dir = os.path.join(VARIANT_DIR, 'indel') @@ -369,15 +503,34 @@ def main(): metadata_df.to_csv(updated_metadata_file, sep='\t', index=False) logging.info(f"Updated metadata saved to {updated_metadata_file}") - # Load Ensembl GTF file - gtf_file = os.path.join(DATA_DIR, 'Homo_sapiens.GRCh37.87.gtf.gz') - if not os.path.exists(gtf_file): - logging.info("Downloading Ensembl GTF file") - gtf_url = 'ftp://ftp.ensembl.org/pub/grch37/release-87/gtf/homo_sapiens/Homo_sapiens.GRCh37.87.gtf.gz' - download_file(gtf_url, DATA_DIR, '') - + logging.info("Checking for GTF") + if not os.path.exists(REFERENCE_GTF) and not os.path.exists(REFERENCE_GTF.replace('.gz', '')): + gtf_url = 'https://ftp.ensembl.org/pub/release-75/gtf/homo_sapiens/Homo_sapiens.GRCh37.75.gtf.gz' + wget_file(gtf_url, REFERENCE_GTF) + logging.info(f'{REFERENCE_GTF}') + + # Verify SnpEff Jar exists + logging.info("Checking for SnpEff jar") + if not os.path.exists(SNPEFF_JAR): + wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', 'snpEff_latest_core.zip') + unzip_zip('snpEff_latest_core.zip') + SNPEFF_JAR = 'snpEff/snpEff.jar' + logging.info(f"{SNPEFF_JAR}") + + logging.info('Preparing annotation database') + # download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) + + logging.info("Preparing reference genome") + prepare_reference(REFERENCE_FA) + + logging.info("Preparing VCF contigs using reference") + contigs_file = os.path.join(DATA_DIR, 'contigs.txt') + if not os.path.exists(contigs_file): + create_contigs(REFERENCE_FA.replace(".gz", '.fai'), contigs_file) + add_contigs(SNV_DIR, INDEL_DIR, contigs_file) + logging.info("Parsing GTF file to get transcript coordinates") - transcript_df = parse_gtf(gtf_file) + transcript_df = parse_gtf(REFERENCE_GTF) 
logging.info("Building interval trees for transcripts") interval_trees = build_interval_tree(transcript_df) @@ -387,7 +540,8 @@ def main(): variant_counts_df = process_all_samples(metadata_df, interval_trees, VARIANT_DIR, SNPEFF_JAR) # Save variant counts table - variant_counts_file = os.path.join(DATA_DIR, 'variant_counts_per_transcript.tsv') + now = datetime.datetime.now().strftime("%Y_%m_%d") + variant_counts_file = os.path.join(DATA_DIR, f'variant_counts_per_transcript_{now}.tsv') variant_counts_df.to_csv(variant_counts_file, sep='\t', header=True) logging.info(f"Variant counts per transcript saved to {variant_counts_file}") From e1791eded81c76e3755400b31b116b829d05b98e Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sat, 5 Oct 2024 13:55:10 -0400 Subject: [PATCH 02/22] Updating README --- seq_to_pheno/tcga/README.md | 33 ++++++++++++++++++++---------- seq_to_pheno/tcga/set_tcga_data.py | 6 ++---- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/seq_to_pheno/tcga/README.md b/seq_to_pheno/tcga/README.md index 0e2df48..f8ce219 100644 --- a/seq_to_pheno/tcga/README.md +++ b/seq_to_pheno/tcga/README.md @@ -1,13 +1,13 @@ -# TCGA +# ICGC/TCGA Preprocessing -Space for processing TCGA data +Space for processing ICGC/TCGA data -## Datasets +### Source The ICGC (International Cancer Genome Consortium) in collaboration with TCGA and the Sanger Institute has created a hub for a wide-range of NGS data and metadata across various cancer types. Instructions on how to access the data are [found on this page](https://docs.icgc-argo.org/docs/data-access/icgc-25k-data). -### Get +## ETL Overview **get normalized transcript expression data** ``` @@ -93,14 +93,25 @@ And here's a look at the format of one of these VCF files (after decompression): What we see here is information about the location of variants found in this subject, along with relevant information like alt_count and ref_count (which can be used to define allele depth and genotypes), as well as information related to the Variant_Classification, and the type of sequencing used to generate the data. -### Mapping to transcripts -The next import step is to map the transcript expression data to the variant calls we have for each sample. +### Annotate Variants +For downstream analysis we will need to filter our variants based on their predicted protein-level effect. To do so, we will annotate our variants using SnpEff for variant predictions. -I've created a script called **set_tcga_data.py** that should set up everything described above while also doing some work to aggregate variants across each transcript and subject. +This will require some extra steps to prepare. We must make sure that we are using the correct reference genome (as described in the PCAWG documentation). We also need to make sure we are using the correct reference GTF -- which will be needed to construct the SnpEff database. -The table you get at the end: -- **variant_counts_per_transcript.tsv** +And then finally, we will need to make sure to add contig-tags to each VCF file. The contig-tags are created from the reference genome used to make the original variant calls (GRCh37.75). - This should be similar to the ICGC transcript expression table -- where the first column has the Ensembl transcript_id, and each column after will correspond to each of our samples from the metadata. +All of the steps above can be run using ```set_tcga_data.py``` **just make sure to adjust your own filepaths**. 
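For orientation, the core annotation step that this part of the pipeline performs is a standard SnpEff run against the GRCh37.75 database named above. The following is a minimal, illustrative sketch only: the paths are placeholders, and this is the generic SnpEff command-line usage rather than a copy of the exact call `set_tcga_data.py` makes internally.

```python
import subprocess

# Illustrative sketch: annotate one consensus VCF with SnpEff against GRCh37.75.
# Paths are placeholders; the pipeline builds the real per-sample paths itself.
snpeff_jar = '/path/to/snpEff/snpEff.jar'
input_vcf = 'sample.consensus.somatic.snv_mnv.vcf.gz'
annotated_vcf = 'sample.consensus.somatic.snv_mnv.ann.vcf'

with open(annotated_vcf, 'w') as out:
    subprocess.run(
        ['java', '-Xmx8g', '-jar', snpeff_jar, 'GRCh37.75', input_vcf],
        stdout=out,
        check=True,
    )
```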
-The cell values are the total variant count found for each transcript's genomic locus and for each subject. \ No newline at end of file +>NOTE: You will need access to both bcftools and samtools. If you have any issues setting up -- feel free to reach out to me (Harrison/Almuraqib) or tinker on your own. + +> NOTE: This script will take a long time to run, it might make sense to subset the list to only a handful of subjects first for downstream testing purposes. + +### Merge Annotations +Once we have successfully preprocessed our annotations, we will need to combine the indels and snvs into a single vcf file. + +We can run this using ```merge_annotations.py``` + +### Get sequences +We can then take these filtered and merged variant calls and then generate protein sequences based on the variants found. It will account for all overlapping variants in a transcript and create a protein-fasta file of the resulting protein-sequence. + +We can run this using ```get_sequences.py``` \ No newline at end of file diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index e3f841c..f322ae7 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -14,10 +14,6 @@ import datetime import shutil - -# NOTE: Script shall be run from the top of the repo -os.chdir('/Users/harrison.reed/codebase/seq-to-pheno') - # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') @@ -431,6 +427,8 @@ def process_all_samples(metadata_df: pd.DataFrame, interval_trees: dict, variant def main(): + # NOTE: Script shall be run from the top of the repo + os.chdir('/Users/harrison.reed/codebase/seq-to-pheno') # USE YOUR OWN PATHS # Define constants or load them from a config file ENDPOINT_URL = 'https://object.genomeinformatics.org' DATA_DIR = 'seq_to_pheno/tcga/data' From c21c01108f50fa98db64690b0ad4ce4949ccfd29 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sat, 5 Oct 2024 14:04:54 -0400 Subject: [PATCH 03/22] uncommenting annotation db stuff --- seq_to_pheno/tcga/set_tcga_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index f322ae7..d9383a3 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -516,7 +516,7 @@ def main(): logging.info(f"{SNPEFF_JAR}") logging.info('Preparing annotation database') - # download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) + download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) logging.info("Preparing reference genome") prepare_reference(REFERENCE_FA) From 19c72570cd27583051e818ba4c16317b04ea626a Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sat, 5 Oct 2024 19:56:09 -0400 Subject: [PATCH 04/22] Added parallelization to optimize sample annotations --- seq_to_pheno/tcga/set_tcga_data.py | 95 +++++++++++++++++------------- 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index d9383a3..ee7be8f 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -9,10 +9,9 @@ from intervaltree import IntervalTree import glob import sys -import requests -from Bio.Seq import Seq -import datetime import shutil +import concurrent.futures +import datetime # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') @@ -340,14 +339,21 @@ def process_sample(args): Processes a single sample to count variants per 
transcript. Parameters: - args (tuple): A tuple containing sample_id, wgs_aliquot_id, interval_trees, variant_dir, snpeff_jar + args (tuple): A tuple containing sample_id, wgs_aliquot_id, variant_dir, snpeff_jar, reference_gtf Returns: - pd.DataFrame or None: DataFrame with transcript IDs as index and variant counts for this sample, or None if no data. + tuple: (sample_id, variant_counts_dict) or (sample_id, None) if no variants found. """ - sample_id, wgs_aliquot_id, interval_trees, variant_dir, snpeff_jar = args + sample_id, wgs_aliquot_id, variant_dir, snpeff_jar, reference_gtf = args variant_counts = defaultdict(int) + # Initialize interval_trees + if 'interval_trees' not in globals(): + global interval_trees + logging.info(f"Building interval trees in worker process for sample {sample_id}") + transcript_df = parse_gtf(reference_gtf) + interval_trees = build_interval_tree(transcript_df) + # Paths to VCF files snv_vcf_pattern = os.path.join(variant_dir, 'snv_mnv', f"{wgs_aliquot_id}.consensus.*.somatic.snv_mnv.vcf.gz") indel_vcf_pattern = os.path.join(variant_dir, 'indel', f"{wgs_aliquot_id}.consensus.*.somatic.indel.vcf.gz") @@ -383,48 +389,57 @@ def process_sample(args): logging.warning(f"Indel VCF not found for sample {sample_id}") if variant_counts: - # Create DataFrame for this sample - df_sample = pd.DataFrame.from_dict(variant_counts, orient='index', columns=[sample_id]) - return df_sample + return sample_id, dict(variant_counts) else: logging.warning(f"No variants found for sample {sample_id}") - return None + return sample_id, None -def process_all_samples(metadata_df: pd.DataFrame, interval_trees: dict, variant_dir: str, snpeff_jar: str) -> pd.DataFrame: - """ - Processes all samples to count variants per transcript. - Parameters: - metadata_df (pd.DataFrame): Metadata DataFrame with samples to process. - interval_trees (dict): Interval trees of transcripts. - variant_dir (str): Directory containing variant files. - snpeff_jar (str): Path to snpEff.jar file. +def process_all_samples(metadata_df: pd.DataFrame, variant_dir: str, snpeff_jar: str, reference_gtf: str) -> pd.DataFrame: + ''' + Processes all samples to count variants per transcript using parallel processing. Returns: pd.DataFrame: DataFrame with transcripts as rows and samples as columns. 
- """ - data_frames = [] + ''' + args_list = [] for idx, row in metadata_df.iterrows(): sample_id = row['aliquot_id'] wgs_aliquot_id = row['wgs_aliquot_id'] - logging.info(f"Processing sample {sample_id}") - args = (sample_id, wgs_aliquot_id, interval_trees, variant_dir, snpeff_jar) - df_sample = process_sample(args) - if df_sample is not None: - data_frames.append(df_sample) - - # Concatenate all sample DataFrames - if data_frames: - variant_counts_df = pd.concat(data_frames, axis=1) - # Fill missing values with 0 - variant_counts_df = variant_counts_df.fillna(0).astype(int) - variant_counts_df.index.name = 'transcript_id' - variant_counts_df = variant_counts_df.sort_index() - else: - variant_counts_df = pd.DataFrame() + logging.info(f"Preparing to process sample {sample_id}") + args = (sample_id, wgs_aliquot_id, variant_dir, snpeff_jar, reference_gtf) + args_list.append(args) + + num_workers = os.cpu_count() - 1 or 1 + all_variant_counts = {} + + with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor: + futures = {executor.submit(process_sample, args): args[0] for args in args_list} + for future in concurrent.futures.as_completed(futures): + sample_id = futures[future] + try: + result = future.result() + if result: + sample_id, variant_counts = result + if variant_counts is not None: + all_variant_counts[sample_id] = variant_counts + else: + logging.warning(f"No variant counts for sample {sample_id}") + except Exception as exc: + logging.error(f"Sample {sample_id} generated an exception: {exc}") + + # Build DataFrame from the collected variant counts + variant_counts_df = pd.DataFrame.from_dict(all_variant_counts, orient='index').fillna(0).astype(int) + variant_counts_df.index.name = 'sample_id' + variant_counts_df = variant_counts_df.sort_index() + + # Transpose to have transcripts as rows and samples as columns + variant_counts_df = variant_counts_df.transpose() + variant_counts_df.index.name = 'transcript_id' return variant_counts_df + def main(): # NOTE: Script shall be run from the top of the repo @@ -434,7 +449,8 @@ def main(): DATA_DIR = 'seq_to_pheno/tcga/data' VARIANT_DIR = os.path.join(DATA_DIR, 'variants') SNV_DIR = os.path.join(VARIANT_DIR, 'snv_mnv/') - INDEL_DIR = os.path.join(VARIANT_DIR, 'indel') + INDEL_DIR = os.path.join(VARIANT_DIR, 'indel/') + COUNTS_DIR = os.path.join(VARIANT_DIR, 'counts/') TRANSCRIPT_BASENAME = 'pcawg.rnaseq.transcript.expr.tpm.tsv.gz' TRANSCRIPT_DATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/transcript_expression/{TRANSCRIPT_BASENAME}' METADATA_BASENAME = 'rnaseq.extended.metadata.aliquot_id.V4.tsv.gz' @@ -449,6 +465,7 @@ def main(): # Create data directories if they don't exist os.makedirs(DATA_DIR, exist_ok=True) os.makedirs(VARIANT_DIR, exist_ok=True) + os.makedirs(COUNTS_DIR, exist_ok=True) # Download files logging.info('Downloading ICGC data: Transcript Expression ') @@ -530,12 +547,9 @@ def main(): logging.info("Parsing GTF file to get transcript coordinates") transcript_df = parse_gtf(REFERENCE_GTF) - logging.info("Building interval trees for transcripts") - interval_trees = build_interval_tree(transcript_df) - # Process samples to get variant counts per transcript logging.info("Processing samples to count variants per transcript") - variant_counts_df = process_all_samples(metadata_df, interval_trees, VARIANT_DIR, SNPEFF_JAR) + variant_counts_df = process_all_samples(metadata_df, VARIANT_DIR, SNPEFF_JAR, REFERENCE_GTF) # Save variant counts table now = datetime.datetime.now().strftime("%Y_%m_%d") @@ -543,6 
+557,5 @@ def main(): variant_counts_df.to_csv(variant_counts_file, sep='\t', header=True) logging.info(f"Variant counts per transcript saved to {variant_counts_file}") - if __name__ == '__main__': main() \ No newline at end of file From 108b3c22bfae6e20989eef4394261b3e380734ce Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 00:12:49 -0400 Subject: [PATCH 05/22] Fixing snpEff download --- seq_to_pheno/tcga/set_tcga_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index ee7be8f..a6e08aa 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -457,7 +457,8 @@ def main(): METADATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/metadata/{METADATA_BASENAME}' SNV_INDEL_BASENAME = "final_consensus_snv_indel_passonly_icgc.public.tgz" SNV_INDEL_S3 = f's3://icgc25k-open/PCAWG/consensus_snv_indel/{SNV_INDEL_BASENAME}' - SNPEFF_JAR = '/Users/harrison.reed/snpEff/snpEff.jar' # Update with the correct path to your snpEff.jar + SNPEFF_DIR = '/home/harrisonhvaughnreed/snpEff/' # Update with the correct path to your snpEff.jar + SNPEFF_JAR = os.path.join(SNPEFF_DIR, 'snpEff.jar') # Update with the correct path to your snpEff.jar REFERENCE_SHORT = 'GRCh37.75' # Make sure snpEff and reference genome match! REFERENCE_FA = os.path.join(DATA_DIR, 'genome.fa.gz') REFERENCE_GTF = os.path.join(DATA_DIR, 'Homo_sapiens.GRCh37.75.gtf.gz') @@ -529,7 +530,7 @@ def main(): if not os.path.exists(SNPEFF_JAR): wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', 'snpEff_latest_core.zip') unzip_zip('snpEff_latest_core.zip') - SNPEFF_JAR = 'snpEff/snpEff.jar' + os.makedirs(SNPEFF_DIR, exist_ok=True) logging.info(f"{SNPEFF_JAR}") logging.info('Preparing annotation database') From 680c33dbb528bf5b6876d467162bcaa190701a3f Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 00:36:29 -0400 Subject: [PATCH 06/22] fixing snpEff path again --- seq_to_pheno/tcga/set_tcga_data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index a6e08aa..2572e8c 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -443,7 +443,7 @@ def process_all_samples(metadata_df: pd.DataFrame, variant_dir: str, snpeff_jar: def main(): # NOTE: Script shall be run from the top of the repo - os.chdir('/Users/harrison.reed/codebase/seq-to-pheno') # USE YOUR OWN PATHS + os.chdir('/home/harrisonhvaughnreed/Projects/seq-to-pheno') # USE YOUR OWN PATHS # Define constants or load them from a config file ENDPOINT_URL = 'https://object.genomeinformatics.org' DATA_DIR = 'seq_to_pheno/tcga/data' @@ -528,9 +528,8 @@ def main(): # Verify SnpEff Jar exists logging.info("Checking for SnpEff jar") if not os.path.exists(SNPEFF_JAR): - wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', 'snpEff_latest_core.zip') - unzip_zip('snpEff_latest_core.zip') - os.makedirs(SNPEFF_DIR, exist_ok=True) + wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', os.path.join(SNPEFF_DIR, 'snpEff_latest_core.zip')) + unzip_zip(os.path.join(SNPEFF_DIR, 'snpEff_latest_core.zip')) logging.info(f"{SNPEFF_JAR}") logging.info('Preparing annotation database') From bf3ee9276d7afc3338c6d8022bd33bbeefeb3e5a Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 00:40:24 -0400 Subject: [PATCH 07/22] reference must be gunzipped not 
unzipped --- seq_to_pheno/tcga/set_tcga_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index 2572e8c..0b18774 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -19,7 +19,7 @@ def prepare_reference(ref): if not os.path.exists(ref) and not os.path.exists(ref.replace('.gz', '')): wget_file('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz', ref) - unzip_zip(ref) + gunzip(ref) index_reference(ref.replace('.gz', '')) @@ -457,7 +457,7 @@ def main(): METADATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/metadata/{METADATA_BASENAME}' SNV_INDEL_BASENAME = "final_consensus_snv_indel_passonly_icgc.public.tgz" SNV_INDEL_S3 = f's3://icgc25k-open/PCAWG/consensus_snv_indel/{SNV_INDEL_BASENAME}' - SNPEFF_DIR = '/home/harrisonhvaughnreed/snpEff/' # Update with the correct path to your snpEff.jar + SNPEFF_DIR = '/home/harrisonhvaughnreed/' # Update with the correct path to your snpEff.jar SNPEFF_JAR = os.path.join(SNPEFF_DIR, 'snpEff.jar') # Update with the correct path to your snpEff.jar REFERENCE_SHORT = 'GRCh37.75' # Make sure snpEff and reference genome match! REFERENCE_FA = os.path.join(DATA_DIR, 'genome.fa.gz') From 2a6cd959b1ef767c1aa85813f718a27d2d2192c2 Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 02:23:21 -0400 Subject: [PATCH 08/22] Correcting some paths --- seq_to_pheno/tcga/set_tcga_data.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index 0b18774..ba7c6e5 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -18,7 +18,7 @@ def prepare_reference(ref): if not os.path.exists(ref) and not os.path.exists(ref.replace('.gz', '')): - wget_file('https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.fna.gz', ref) + wget_file('https://ftp.sanger.ac.uk/pub/project/PanCancer/genome.fa.gz', ref) gunzip(ref) index_reference(ref.replace('.gz', '')) @@ -99,10 +99,10 @@ def download_annotation_database(snpeff_jar, genome): except subprocess.CalledProcessError as e: logging.error(f"Failed to download annotation database {genome}: {e}") -def unzip_zip(zip): +def unzip_zip(zip, outdir): try: - command = ['unzip', zip] - logging.info(f"Unzipping {zip} to {zip.replace('.zip', '')}") + command = ['unzip', zip, '-d', outdir] + logging.info(f"Unzipping {zip} to {outdir}") subprocess.run(command, check=True) except subprocess.CalledProcessError as e: logging.error(f"Failed to unzip {zip}: {e}") @@ -443,7 +443,9 @@ def process_all_samples(metadata_df: pd.DataFrame, variant_dir: str, snpeff_jar: def main(): # NOTE: Script shall be run from the top of the repo - os.chdir('/home/harrisonhvaughnreed/Projects/seq-to-pheno') # USE YOUR OWN PATHS + HOME_DIR = '/home/harrisonhvaughnreed/' + BASE_DIR = os.path.join(HOME_DIR, 'Projects/seq-to-pheno') + os.chdir(BASE_DIR) # USE YOUR OWN PATHS # Define constants or load them from a config file ENDPOINT_URL = 'https://object.genomeinformatics.org' DATA_DIR = 'seq_to_pheno/tcga/data' @@ -457,7 +459,7 @@ def main(): METADATA_S3 = f's3://icgc25k-open/PCAWG/transcriptome/metadata/{METADATA_BASENAME}' SNV_INDEL_BASENAME = "final_consensus_snv_indel_passonly_icgc.public.tgz" SNV_INDEL_S3 = 
f's3://icgc25k-open/PCAWG/consensus_snv_indel/{SNV_INDEL_BASENAME}' - SNPEFF_DIR = '/home/harrisonhvaughnreed/' # Update with the correct path to your snpEff.jar + SNPEFF_DIR = os.path.join(HOME_DIR, 'snpEff/') # Update with the correct path to your snpEff.jar SNPEFF_JAR = os.path.join(SNPEFF_DIR, 'snpEff.jar') # Update with the correct path to your snpEff.jar REFERENCE_SHORT = 'GRCh37.75' # Make sure snpEff and reference genome match! REFERENCE_FA = os.path.join(DATA_DIR, 'genome.fa.gz') @@ -528,8 +530,8 @@ def main(): # Verify SnpEff Jar exists logging.info("Checking for SnpEff jar") if not os.path.exists(SNPEFF_JAR): - wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', os.path.join(SNPEFF_DIR, 'snpEff_latest_core.zip')) - unzip_zip(os.path.join(SNPEFF_DIR, 'snpEff_latest_core.zip')) + wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', os.path.join(HOME_DIR, 'snpEff_latest_core.zip')) + unzip_zip(os.path.join(HOME_DIR, 'snpEff_latest_core.zip'), HOME_DIR) logging.info(f"{SNPEFF_JAR}") logging.info('Preparing annotation database') @@ -543,9 +545,6 @@ def main(): if not os.path.exists(contigs_file): create_contigs(REFERENCE_FA.replace(".gz", '.fai'), contigs_file) add_contigs(SNV_DIR, INDEL_DIR, contigs_file) - - logging.info("Parsing GTF file to get transcript coordinates") - transcript_df = parse_gtf(REFERENCE_GTF) # Process samples to get variant counts per transcript logging.info("Processing samples to count variants per transcript") From 99115b50358228e8220cc2ff5906ab4d16577543 Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 02:28:47 -0400 Subject: [PATCH 09/22] Only download database when downloading snpEff --- seq_to_pheno/tcga/set_tcga_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index ba7c6e5..f50cfce 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -532,10 +532,10 @@ def main(): if not os.path.exists(SNPEFF_JAR): wget_file('https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip', os.path.join(HOME_DIR, 'snpEff_latest_core.zip')) unzip_zip(os.path.join(HOME_DIR, 'snpEff_latest_core.zip'), HOME_DIR) + logging.info('Preparing annotation database') + download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) logging.info(f"{SNPEFF_JAR}") - logging.info('Preparing annotation database') - download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) logging.info("Preparing reference genome") prepare_reference(REFERENCE_FA) From 4f681b00aeb7158e02f580f77520679a30952736 Mon Sep 17 00:00:00 2001 From: HReed1 Date: Sun, 6 Oct 2024 13:40:17 -0400 Subject: [PATCH 10/22] updates requirements --- seq_to_pheno/tcga/requirements.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/seq_to_pheno/tcga/requirements.txt b/seq_to_pheno/tcga/requirements.txt index 9588b42..39fe82a 100644 --- a/seq_to_pheno/tcga/requirements.txt +++ b/seq_to_pheno/tcga/requirements.txt @@ -1,2 +1,7 @@ -pandas collections +pandas +pysam +intervaltree +biopython +requests +tqdm From 96c0df3d673fd954373b7100ff447463a33fe5cf Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 14:11:22 -0400 Subject: [PATCH 11/22] Add step to generate wildtype transcripts as well --- seq_to_pheno/tcga/get_sequences.py | 81 +++++++++++++++++------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/seq_to_pheno/tcga/get_sequences.py 
b/seq_to_pheno/tcga/get_sequences.py index 776a79f..eb3a0bc 100644 --- a/seq_to_pheno/tcga/get_sequences.py +++ b/seq_to_pheno/tcga/get_sequences.py @@ -223,19 +223,19 @@ def save_protein_sequence_to_fasta(sequence, protein_id, output_file): f.write(sequence[i:i+60] + '\n') def process_sample(args): - vcf_file, output_dir, sample_id = args - process_vcf(vcf_file, output_dir, sample_id) + vcf_file, output_dir, sample_id, wildtype_output_dir = args + process_vcf(vcf_file, output_dir, sample_id, wildtype_output_dir) -def process_vcf_directory(vcf_dir, output_dir): +def process_vcf_directory(vcf_dir, output_dir, wildtype_output_dir): tasks = [] for filename in os.listdir(vcf_dir): if filename.endswith('.vcf') or filename.endswith('.vcf.gz'): sample_id = filename.replace('.combined.vcf', '').replace('.vcf', '').replace('.gz', '') vcf_file = os.path.join(vcf_dir, filename) - tasks.append((vcf_file, output_dir, sample_id)) + tasks.append((vcf_file, output_dir, sample_id, wildtype_output_dir)) return tasks -def process_vcf(vcf_file, output_dir, sample_id): +def process_vcf(vcf_file, output_dir, sample_id, wildtype_output_dir): vcf_in = pysam.VariantFile(vcf_file) transcript_variants = {} failed_variants = [] @@ -296,38 +296,47 @@ def process_vcf(vcf_file, output_dir, sample_id): logging.warning(f"CDS sequence not found for transcript {transcript_id}") continue - # Sort variants by cDNA position (descending) - def extract_cdna_position(hgvs_c): - match = re.match(r'c\.([\d+]+)', hgvs_c) - if match: - return int(match.group(1)) - else: - logging.warning(f"Could not extract cDNA position from {hgvs_c}") - return float('inf') - - variants.sort(key=lambda x: extract_cdna_position(x[0]), reverse=True) - - mutated_cds_sequence = cds_sequence - for hgvs_c, _ in variants: - try: - mutated_cds_sequence = apply_variant_to_cds(mutated_cds_sequence, hgvs_c) - except NotImplementedError as e: - logging.warning(f"Variant not implemented {hgvs_c} for transcript {transcript_id}: {e}") - failed_variants.append((transcript_id, hgvs_c, str(e))) - continue - except Exception as e: - logging.error(f"Error applying variant {hgvs_c} to transcript {transcript_id}: {e}") - failed_variants.append((transcript_id, hgvs_c, str(e))) - continue - - # Translate the mutated CDS into protein sequence - protein_sequence = translate_cds_to_protein(mutated_cds_sequence) + # Save wildtype protein sequence if not already saved + wildtype_protein_file = os.path.join(wildtype_output_dir, f"{transcript_id}.fasta") + if not os.path.exists(wildtype_protein_file): + wildtype_protein_sequence = translate_cds_to_protein(cds_sequence) + save_protein_sequence_to_fasta(wildtype_protein_sequence, transcript_id, wildtype_protein_file) + logging.info(f"Saved wildtype protein sequence for {transcript_id} to {wildtype_protein_file}") - # Save the mutated protein sequence to a FASTA file protein_id = f"{sample_id}_{transcript_id}_mutated" output_file = os.path.join(output_dir, f"{protein_id}.fasta") - save_protein_sequence_to_fasta(protein_sequence, protein_id, output_file) - logging.info(f"Saved mutated protein sequence for {transcript_id} to {output_file}") + + if not os.path.exists(output_file): + # Sort variants by cDNA position (descending) + def extract_cdna_position(hgvs_c): + match = re.match(r'c\.([\d+]+)', hgvs_c) + if match: + return int(match.group(1)) + else: + logging.warning(f"Could not extract cDNA position from {hgvs_c}") + return float('inf') + + variants.sort(key=lambda x: extract_cdna_position(x[0]), reverse=True) + + 
mutated_cds_sequence = cds_sequence + for hgvs_c, _ in variants: + try: + mutated_cds_sequence = apply_variant_to_cds(mutated_cds_sequence, hgvs_c) + except NotImplementedError as e: + logging.warning(f"Variant not implemented {hgvs_c} for transcript {transcript_id}: {e}") + failed_variants.append((transcript_id, hgvs_c, str(e))) + continue + except Exception as e: + logging.error(f"Error applying variant {hgvs_c} to transcript {transcript_id}: {e}") + failed_variants.append((transcript_id, hgvs_c, str(e))) + continue + + # Translate the mutated CDS into protein sequence + protein_sequence = translate_cds_to_protein(mutated_cds_sequence) + + # Save the mutated protein sequence to a FASTA file + save_protein_sequence_to_fasta(protein_sequence, protein_id, output_file) + logging.info(f"Saved mutated protein sequence for {transcript_id} to {output_file}") # Optionally, write failed variants to a log file if failed_variants: @@ -351,9 +360,11 @@ def extract_cdna_position(hgvs_c): vcf_dir = 'seq_to_pheno/tcga/data/variants/vcf' output_dir = 'seq_to_pheno/tcga/data/variants/mutated_proteins' + wildtype_output_dir = 'seq_to_pheno/tcga/data/variants/wildtype_proteins' os.makedirs(output_dir, exist_ok=True) + os.makedirs(wildtype_output_dir, exist_ok=True) - tasks = process_vcf_directory(vcf_dir, output_dir) + tasks = process_vcf_directory(vcf_dir, output_dir, wildtype_output_dir) # Start the multiprocessing executor with ProcessPoolExecutor(max_workers=4) as executor: From f37a751019015dc8d44165a1b0389d11b2c99486 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 14:12:00 -0400 Subject: [PATCH 12/22] Updating readme with system requirements and python packages --- seq_to_pheno/tcga/README.md | 23 +++++++++++++++++++++++ seq_to_pheno/tcga/requirements.txt | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/seq_to_pheno/tcga/README.md b/seq_to_pheno/tcga/README.md index f8ce219..740a29e 100644 --- a/seq_to_pheno/tcga/README.md +++ b/seq_to_pheno/tcga/README.md @@ -7,6 +7,29 @@ The ICGC (International Cancer Genome Consortium) in collaboration with TCGA and Instructions on how to access the data are [found on this page](https://docs.icgc-argo.org/docs/data-access/icgc-25k-data). 
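For a concrete sense of what the download step looks like, the open-tier objects referenced in this README live in the `icgc25k-open` bucket and can be fetched with the AWS CLI pointed at the ICGC object store. A minimal sketch follows; the endpoint and S3 URI are the constants used later in `set_tcga_data.py`, whose `download_s3_file` helper wraps a similar AWS CLI call, and it assumes the AWS CLI is installed and configured.

```python
import subprocess

# Sketch: pull the PCAWG RNA-seq metadata table from the ICGC 25K open bucket.
endpoint = 'https://object.genomeinformatics.org'
s3_uri = ('s3://icgc25k-open/PCAWG/transcriptome/metadata/'
          'rnaseq.extended.metadata.aliquot_id.V4.tsv.gz')

subprocess.run(
    ['aws', 's3', 'cp', s3_uri, 'seq_to_pheno/tcga/data/',
     '--endpoint-url', endpoint],
    check=True,
)
```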
+## Requirements +#### For you Shell environment +``` +wget +unzip +java (openjdk-23) +tabix +bcftools +samtools +git +``` + +#### Python packages: +``` +collections +pandas +pysam +intervaltree +biopython +requests +tqdm +``` + ## ETL Overview **get normalized transcript expression data** diff --git a/seq_to_pheno/tcga/requirements.txt b/seq_to_pheno/tcga/requirements.txt index 39fe82a..842efa9 100644 --- a/seq_to_pheno/tcga/requirements.txt +++ b/seq_to_pheno/tcga/requirements.txt @@ -4,4 +4,4 @@ pysam intervaltree biopython requests -tqdm +tqdm \ No newline at end of file From 908d09d9247a66926b5b2494e9983661356d503e Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 18:17:18 -0400 Subject: [PATCH 13/22] Making sure we pull the correct reference genome from the ensembl REST-api --- seq_to_pheno/tcga/get_sequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seq_to_pheno/tcga/get_sequences.py b/seq_to_pheno/tcga/get_sequences.py index eb3a0bc..1b4140d 100644 --- a/seq_to_pheno/tcga/get_sequences.py +++ b/seq_to_pheno/tcga/get_sequences.py @@ -23,7 +23,7 @@ def fetch_transcript_cds(transcript_id): if transcript_id in transcript_cds_cache: return transcript_cds_cache[transcript_id] - server = "https://rest.ensembl.org" + server = "https://grch37.rest.ensembl.org" ext = f"/sequence/id/{transcript_id}?type=cds" headers = {"Content-Type": "text/plain"} From 1a44c46ef86a01ceb6206c56876ca2437ca7cfb9 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 19:35:05 -0400 Subject: [PATCH 14/22] Protein metadata table created -- yayy --- .../tcga/create_protein_metadata_table.py | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 seq_to_pheno/tcga/create_protein_metadata_table.py diff --git a/seq_to_pheno/tcga/create_protein_metadata_table.py b/seq_to_pheno/tcga/create_protein_metadata_table.py new file mode 100644 index 0000000..27d6264 --- /dev/null +++ b/seq_to_pheno/tcga/create_protein_metadata_table.py @@ -0,0 +1,122 @@ +import os +import pandas as pd +import argparse + +def read_fasta_sequence(fasta_path): + """ + Reads a FASTA file and returns the sequence as a string. + Assumes a single sequence per file. 
+ """ + sequence = '' + with open(fasta_path, 'r') as f: + for line in f: + if line.startswith('>'): + continue # Skip header lines + sequence += line.strip() + return sequence + +def main(): + # Parse command-line arguments + parser = argparse.ArgumentParser(description='Create protein sequences metadata table.') + parser.add_argument('--include-sequences', action='store_true', + help='Include actual protein sequences in the output instead of file paths.') + args = parser.parse_args() + + # Paths to directories + mutated_proteins_dir = 'seq_to_pheno/tcga/data/variants/mutated_proteins' + wildtype_proteins_dir = 'seq_to_pheno/tcga/data/variants/wildtype_proteins' + metadata_file = 'seq_to_pheno/tcga/data/rnaseq.extended.metadata.aliquot_id.V4.tsv.gz' + output_file = 'seq_to_pheno/tcga/data/protein_sequences_metadata.tsv' + + # Read the metadata file into a DataFrame + metadata_df = pd.read_csv(metadata_file, sep='\t', compression='gzip', low_memory=False) + + # List all mutated protein FASTA files + mutated_files = [f for f in os.listdir(mutated_proteins_dir) if f.endswith('.fasta')] + + data_rows = [] + + for filename in mutated_files: + # Extract sample ID and transcript ID from filename + # Filename format: {sample_id}_{transcript_id}_mutated.fasta + parts = filename.split('_') + if len(parts) < 3: + print(f"Filename {filename} does not match expected format.") + continue + + sample_id = parts[0] + transcript_id = parts[1] + + # Paths to the mutated and wildtype protein files + mutated_protein_path = os.path.join(mutated_proteins_dir, filename) + wildtype_protein_filename = f"{transcript_id}.fasta" + wildtype_protein_path = os.path.join(wildtype_proteins_dir, wildtype_protein_filename) + + # Check if the wildtype protein file exists + if not os.path.exists(wildtype_protein_path): + print(f"Wildtype protein file {wildtype_protein_filename} not found.") + continue + + # Get metadata for the sample + sample_metadata = metadata_df[metadata_df['wgs_aliquot_id'] == sample_id] + + if sample_metadata.empty: + print(f"No metadata found for sample {sample_id}.") + continue + + # Extract required metadata fields + # Ensure to handle missing data + sample_metadata = sample_metadata.iloc[0] # Get the first matching row + + # Columns to extract + columns_to_extract = { + 'aliquot_id': 'aliquot_id', + 'wgs_aliquot_id': 'wgs_aliquot_id', + 'Cancer Type': 'study', + 'Cancer Stage': 'tumour_stage', + 'Donor Survival Time': 'donor_survival_time', + 'Donor Vital Status': 'donor_vital_status', + 'Donor Age at Diagnosis': 'donor_age_at_diagnosis', + 'Tumour Grade': 'tumour_grade', + 'Donor Sex': 'donor_sex', + 'Histology Abbreviation': 'histology_abbreviation', + # Add more fields as needed + } + + # Build a dictionary for the row + row = { + 'aliquot_id': sample_id, + } + + # Include sequences or file paths based on the argument + if args.include_sequences: + # Read the sequences from the FASTA files + mutated_sequence = read_fasta_sequence(mutated_protein_path) + wildtype_sequence = read_fasta_sequence(wildtype_protein_path) + row['mutated_protein_sequence'] = mutated_sequence + row['wildtype_protein_sequence'] = wildtype_sequence + else: + # Include the file paths + row['mutated_protein_path'] = mutated_protein_path + row['wildtype_protein_path'] = wildtype_protein_path + + for col_name, meta_col in columns_to_extract.items(): + value = sample_metadata.get(meta_col, '') + row[col_name] = value + + data_rows.append(row) + + # If including sequences and we have collected 50 rows, stop + # if 
args.include_sequences and len(data_rows) >= 50: + # break + + # Create a DataFrame from the collected rows + result_df = pd.DataFrame(data_rows) + + # Save the DataFrame to a TSV file + result_df.to_csv(output_file, index=False, sep='\t') + + print(f"Metadata table saved to {output_file}") + +if __name__ == '__main__': + main() From a1b26ea773a8270a7a24caf1b713450108650375 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 20:31:26 -0400 Subject: [PATCH 15/22] Updated readme --- seq_to_pheno/tcga/README.md | 132 +++++++++++++++++- .../tcga/create_protein_metadata_table.py | 4 - 2 files changed, 130 insertions(+), 6 deletions(-) diff --git a/seq_to_pheno/tcga/README.md b/seq_to_pheno/tcga/README.md index 740a29e..f7abe7d 100644 --- a/seq_to_pheno/tcga/README.md +++ b/seq_to_pheno/tcga/README.md @@ -129,12 +129,140 @@ All of the steps above can be run using ```set_tcga_data.py``` **just make sure > NOTE: This script will take a long time to run, it might make sense to subset the list to only a handful of subjects first for downstream testing purposes. +Once finished we can now see the **ANN** tag now added to the info field on our VCF files: + +In the header: +``` +##INFO= +``` + +Example VCF line: +``` +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1273949 . A G . . Callers=broad,dkfz,muse,sanger;NumCallers=4;VAF=0.3182;cosmic=COSM4909256;t_alt_count=28;t_ref_count=60;Variant_Classification=Missense_Mutation;ANN=G|missense_variant|MODERATE|DVL1|ENSG00000107404|transcript|ENST00000378888|protein_coding|12/15|c.1292T>C|p.Ile431Thr|1577/3239|1292/2088|431/695||,G|missense_variant|MODERATE|DVL1|ENSG00000107404|transcript|ENST00000378891|protein_coding|12/15|c.1217T>C|p.Ile406Thr|1264/2926|1217/2013|406/670||,G|downstream_gene_variant|MODIFIER|TAS1R3|ENSG00000169962|transcript|ENST00000339381|protein_coding||c.*4105A>G|||||3263|,G|downstream_gene_variant|MODIFIER|DVL1|ENSG00000107404|transcript|ENST00000472445|retained_intron||n.*3201T>C|||||3201| +``` + + ### Merge Annotations Once we have successfully preprocessed our annotations, we will need to combine the indels and snvs into a single vcf file. We can run this using ```merge_annotations.py``` +This will create the directory ```seq_to_pheno/tcga/data/variants/vcf/``` that will contain our combined annotate vcf. + ### Get sequences -We can then take these filtered and merged variant calls and then generate protein sequences based on the variants found. It will account for all overlapping variants in a transcript and create a protein-fasta file of the resulting protein-sequence. +We can now take these filtered and merged variant calls and then generate protein sequences based on the variants found. It will account for all overlapping variants in a transcript and create a protein-fasta file of the resulting protein-sequence. + +We can run this using ```get_sequences.py``` + +This will create two new directories: + +``` +seq_to_pheno/tcga/data/variants/mutated_proteins +seq_to_pheno/tcga/data/variants/wildtype_proteins/ +``` + +These are dirrectories that will contain the various protein-fasta sequences of our mutated proteins as well as their wildtype counterparts. 
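Under the hood, producing each mutated protein mostly amounts to editing the transcript's coding sequence (CDS) at the positions given by the SnpEff `HGVS.c` annotations and re-translating it. Here is a minimal sketch of that idea for a simple substitution; it is illustrative only, while the real `get_sequences.py` also handles indels and overlapping variants and fetches each CDS from the Ensembl GRCh37 REST API.

```python
import re
from Bio.Seq import Seq

def apply_snv(cds: str, hgvs_c: str) -> str:
    # Toy version of the variant-application step: simple substitutions only.
    m = re.match(r'c\.(\d+)([ACGT])>([ACGT])$', hgvs_c)
    if not m:
        raise NotImplementedError(hgvs_c)
    pos, ref, alt = int(m.group(1)), m.group(2), m.group(3)
    assert cds[pos - 1] == ref, 'reference base mismatch'
    return cds[:pos - 1] + alt + cds[pos:]

cds = 'ATGGCTTGGTAA'                                 # hypothetical CDS
mutated = apply_snv(cds, 'c.5C>A')                   # -> 'ATGGATTGGTAA'
protein = str(Seq(mutated).translate(to_stop=True))  # -> 'MDW'
```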
+
+Mutant transcripts
+```
+➜ ls seq_to_pheno/tcga/data/variants/mutated_proteins | head
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000003084_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000020673_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000046087_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000192314_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000228327_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000229030_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000230859_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000245255_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000248058_mutated.fasta
+0009b464-b376-4fbc-8a56-da538269a02f_ENST00000253001_mutated.fasta
+
+```
+
+In mutated_proteins you will also find log files for variants that passed filtration but couldn't be included in the protein transcript for one reason or another:
+
+```
+➜ ls seq_to_pheno/tcga/data/variants/mutated_proteins/*failed* | head
+0009b464-b376-4fbc-8a56-da538269a02f_failed_variants.log
+005794f1-5a87-45b5-9811-83ddf6924568_failed_variants.log
+00b9d0e6-69dc-4345-bffd-ce32880c8eef_failed_variants.log
+00db1b95-8ca3-4cc4-bb46-6b8c8019a7c7_failed_variants.log
+0168a2a6-c3af-4d58-a51c-d33f0fc7876d_failed_variants.log
+02917220-6a7a-46a1-8656-907e96bef88e_failed_variants.log
+03ad38a6-0902-4aaa-84a3-91ea88fa9883_failed_variants.log
+03c3c692-8a86-4843-85ae-e045f0fa6f88_failed_variants.log
+046d7386-95c8-4501-9e55-c85bec272a7a_failed_variants.log
+04b570c2-3224-4e9b-81cc-089b4a7ff07a_failed_variants.log
+```
+
+>**NOTE**: Although these errors represent a small minority of all variant calls we process here -- these instances will need to be investigated eventually. Most likely, there is some mismatch between the reference used to create the original call and the one we've used so far. This is likely due to an upstream issue inherent to the dataset, but that will have to be proven.
+
+
+Wildtype transcripts:
+
+```
+➜ ls seq_to_pheno/tcga/data/variants/wildtype_proteins | head
+ENST00000001008.fasta
+ENST00000001146.fasta
+ENST00000002125.fasta
+ENST00000002165.fasta
+ENST00000002596.fasta
+ENST00000002829.fasta
+ENST00000003084.fasta
+ENST00000003302.fasta
+ENST00000003583.fasta
+ENST00000004103.fasta
+
+```
+
+### Protein Metadata Construction
+
+Okay, now that we have all of our sequences generated, we must map them back to sample metadata.
+
+For this, I have created ```create_protein_metadata_table.py```
+
+This script will create a table with a unique row for each mutated_protein we've generated, with columns that contain the related wildtype_protein as well as important metadata related to the individual whose tumor produced the mutant protein.
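Once the table exists, selecting a cohort for downstream work can be a simple pandas filter. This is a sketch only, assuming the column names match the examples shown below (`Histology Abbreviation`, `Donor Survival Time`, and so on).

```python
import pandas as pd

# Sketch: subset the metadata table to one cohort, ordered by survival time.
df = pd.read_csv('seq_to_pheno/tcga/data/protein_sequences_metadata.tsv', sep='\t')

liver = (
    df[df['Histology Abbreviation'] == 'Liver-HCC']
    .dropna(subset=['Donor Survival Time'])
    .sort_values('Donor Survival Time')
)
print(liver[['aliquot_id', 'transcript_id', 'Donor Survival Time']].head())
```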
+ +This script can be run to only include relative paths to the proteins: + +``` +➜ python seq-to-pheno/seq_to_pheno/tcga/create_protein_metadata_table.py + +Metadata table saved to seq_to_pheno/tcga/data/protein_sequences_metadata.tsv +``` + +Inside table: +``` +aliquot_id mutated_protein_path wildtype_protein_path wgs_aliquot_id Cancer Type Cancer Stage Donor Survival Time Donor Vital Status Donor Age at Diagnosis Tumour Grade Donor Sex Histology Abbreviation +8fb9496e-ddb8-11e4-ad8f-5ed8e2d07381 seq_to_pheno/tcga/data/variants/mutated_proteins/80ab6c08-c622-11e3-bf01-24c6515278c0_ENST00000512632_mutated.fasta seq_to_pheno/tcga/data/variants/wildtype_proteins/ENST00000512632.fasta 80ab6c08-c622-11e3-bf01-24c6515278c0 Liver Cancer - RIKEN, JP 2 1440.0 deceased 67.0 I male Liver-HCC +``` + +The script can also be run to include the actual mutant and wildtype transcripts in the table as well by using the ```-include-sequences``` flag: + +``` +➜ python seq-to-pheno/seq_to_pheno/tcga/create_protein_metadata_table.py -include-sequences +Metadata table saved to seq_to_pheno/tcga/data/protein_sequences_metadata.tsv +``` + +Inside table: +``` +aliquot_id mutated_protein_path wildtype_protein_path wgs_aliquot_id Cancer Type Cancer Stage Donor Survival Time Donor Vital Status Donor Age at Diagnosis Tumour Grade Donor Sex Histology Abbreviation +8fb6f8bc-ddb8-11e4-ad8f-5ed8e2d07381 MLTRLQVLTLALFSKGFLLSLGDHNFLRREIKIEGDLVLGGLFPINEKGTGTEECGRINEDRGIQRLEAMLFAIDEINKDDYLLPGVKLSVHILDTCSRDTYALEQSLEFVRASLTKVDEAEYMCPDGSYAIQENIPLLIAGVIGGSYSSVSIQVANLLRLFQIPQISYASTSAKLSDKSRYDYFARTVPPDFYQAKAMAEILRFFNWTYVSTVASEGDYGETGIEAFEQEARLRNICIATAEKVGRSNIRKSYDSVIRELLQKPNARVVVLFMRSDDSRELIAAASRANASFTWVASDGWGAQESIIKGSEHVAYGAITLELASQPVRQFDRYFQSLNPYNNHRNPWFRDFWEQKFQCSLQNKRNHRRVCDKHLAIDSSNYEQESKIMFVVNAVYAMAHALHKMQRTLCPNTTKLCDAMKILDGKKLYKDYLLKINFTGADDNHVHLCQPEWLCGLGLFVCTQGSHHPVSTPEECCHTQTAPQQVQCQWNWDHILSVLCKHVCANGVQWAGSPRLHHLISVIVNCSSVLVFLDC* MLTRLQVLTLALFSKGFLLSLGDHNFLRREIKIEGDLVLGGLFPINEKGTGTEECGRINEDRGIQRLEAMLFAIDEINKDDYLLPGVKLGVHILDTCSRDTYALEQSLEFVRASLTKVDEAEYMCPDGSYAIQENIPLLIAGVIGGSYSSVSIQVANLLRLFQIPQISYASTSAKLSDKSRYDYFARTVPPDFYQAKAMAEILRFFNWTYVSTVASEGDYGETGIEAFEQEARLRNICIATAEKVGRSNIRKSYDSVIRELLQKPNARVVVLFMRSDDSRELIAAASRANASFTWVASDGWGAQESIIKGSEHVAYGAITLELASQPVRQFDRYFQSLNPYNNHRNPWFRDFWEQKFQCSLQNKRNHRRVCDKHLAIDSSNYEQESKIMFVVNAVYAMAHALHKMQRTLCPNTTKLCDAMKILDGKKLYKDYLLKINFTGADDNHVHLCQPEWLCGLGLFVCTQGSHHPVSTPEECCHTQTAPQQVQCQWNWDHILSVLCKHVCANGVQWAGSPRLHHLISVIVNCSSVLVFLDC* 15fd8dc8-c622-11e3-bf01-24c6515278c0 Liver Cancer - RIKEN, JP 4 1200.0 deceased 62.0 III male Liver-HCC +``` + +## Next Steps + +With this table we are at the point where we are very close to being ready to generate embeddings. We just need to create a way to subset the table intelligentl. + +### Example Analysis: -We can run this using ```get_sequences.py``` \ No newline at end of file +- Load the seq_to_pheno/tcga/data/protein_sequences_metadata.tsv table into a dataframe +- Filter for a cancer type (Liver Cancer) +- Filter for a certain cancer stage (2 or T3N0MX) +- Order table by donor survival time +- Generate embeddings of wildtype and mutant transcripts. +- Experiment: + - Wildtype embeddings should cluster separately from mutant embeddings. + - Mutant embeddings should cluster based on the extremes of donor survival time. + - Transcripts with the highest relational effect should show patterns across all/most subjects. 
\ No newline at end of file diff --git a/seq_to_pheno/tcga/create_protein_metadata_table.py b/seq_to_pheno/tcga/create_protein_metadata_table.py index 27d6264..2001691 100644 --- a/seq_to_pheno/tcga/create_protein_metadata_table.py +++ b/seq_to_pheno/tcga/create_protein_metadata_table.py @@ -106,10 +106,6 @@ def main(): data_rows.append(row) - # If including sequences and we have collected 50 rows, stop - # if args.include_sequences and len(data_rows) >= 50: - # break - # Create a DataFrame from the collected rows result_df = pd.DataFrame(data_rows) From 9f90c0b4b4e36783e9a76292e1175df0568734d2 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 20:38:41 -0400 Subject: [PATCH 16/22] A little extra for the README --- seq_to_pheno/tcga/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/seq_to_pheno/tcga/README.md b/seq_to_pheno/tcga/README.md index f7abe7d..f04d9fb 100644 --- a/seq_to_pheno/tcga/README.md +++ b/seq_to_pheno/tcga/README.md @@ -265,4 +265,6 @@ With this table we are at the point where we are very close to being ready to ge - Experiment: - Wildtype embeddings should cluster separately from mutant embeddings. - Mutant embeddings should cluster based on the extremes of donor survival time. - - Transcripts with the highest relational effect should show patterns across all/most subjects. \ No newline at end of file + - Transcripts with the highest relational effect should show patterns across all/most subjects. + +Don't feel committed to this exact example analysis. There are probably many ways we can create a reasonable and reproducable experiment. \ No newline at end of file From cf4c073bdbb68a68bed06fc5124a46f443b1ee88 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 21:40:57 -0400 Subject: [PATCH 17/22] Including TRANSCRIPT_ID in the protein metadata table --- seq_to_pheno/tcga/create_protein_metadata_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/seq_to_pheno/tcga/create_protein_metadata_table.py b/seq_to_pheno/tcga/create_protein_metadata_table.py index 2001691..b3c14c1 100644 --- a/seq_to_pheno/tcga/create_protein_metadata_table.py +++ b/seq_to_pheno/tcga/create_protein_metadata_table.py @@ -86,6 +86,7 @@ def main(): # Build a dictionary for the row row = { 'aliquot_id': sample_id, + 'transcript_id': transcript_id } # Include sequences or file paths based on the argument From 14812226eaef1f3a1c13c7ce5dee378986ab2378 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Sun, 6 Oct 2024 22:25:33 -0400 Subject: [PATCH 18/22] minor cleanups --- seq_to_pheno/tcga/create_protein_metadata_table.py | 8 ++++---- seq_to_pheno/tcga/get_sequences.py | 2 +- seq_to_pheno/tcga/set_tcga_data.py | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/seq_to_pheno/tcga/create_protein_metadata_table.py b/seq_to_pheno/tcga/create_protein_metadata_table.py index b3c14c1..b06eb95 100644 --- a/seq_to_pheno/tcga/create_protein_metadata_table.py +++ b/seq_to_pheno/tcga/create_protein_metadata_table.py @@ -94,12 +94,12 @@ def main(): # Read the sequences from the FASTA files mutated_sequence = read_fasta_sequence(mutated_protein_path) wildtype_sequence = read_fasta_sequence(wildtype_protein_path) - row['mutated_protein_sequence'] = mutated_sequence - row['wildtype_protein_sequence'] = wildtype_sequence + row['mutated_protein'] = mutated_sequence + row['wildtype_protein'] = wildtype_sequence else: # Include the file paths - row['mutated_protein_path'] = mutated_protein_path - row['wildtype_protein_path'] = 
wildtype_protein_path + row['mutated_protein'] = mutated_protein_path + row['wildtype_protein'] = wildtype_protein_path for col_name, meta_col in columns_to_extract.items(): value = sample_metadata.get(meta_col, '') diff --git a/seq_to_pheno/tcga/get_sequences.py b/seq_to_pheno/tcga/get_sequences.py index 1b4140d..f51c21d 100644 --- a/seq_to_pheno/tcga/get_sequences.py +++ b/seq_to_pheno/tcga/get_sequences.py @@ -10,6 +10,7 @@ # Configure logging logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s') +session = requests.Session() # Initialize a cache for CDS sequences if os.path.exists('cds_cache.pkl'): @@ -18,7 +19,6 @@ else: transcript_cds_cache = {} -session = requests.Session() def fetch_transcript_cds(transcript_id): if transcript_id in transcript_cds_cache: diff --git a/seq_to_pheno/tcga/set_tcga_data.py b/seq_to_pheno/tcga/set_tcga_data.py index f50cfce..a09ca52 100644 --- a/seq_to_pheno/tcga/set_tcga_data.py +++ b/seq_to_pheno/tcga/set_tcga_data.py @@ -412,7 +412,6 @@ def process_all_samples(metadata_df: pd.DataFrame, variant_dir: str, snpeff_jar: num_workers = os.cpu_count() - 1 or 1 all_variant_counts = {} - with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor: futures = {executor.submit(process_sample, args): args[0] for args in args_list} for future in concurrent.futures.as_completed(futures): @@ -536,7 +535,6 @@ def main(): download_annotation_database(SNPEFF_JAR, REFERENCE_SHORT) logging.info(f"{SNPEFF_JAR}") - logging.info("Preparing reference genome") prepare_reference(REFERENCE_FA) From c596eed9d6811e12d6de794e1e9fcbbe8f63b283 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Mon, 7 Oct 2024 00:22:52 -0400 Subject: [PATCH 19/22] Protein regressor --- .../tcga/protein_transformer_example.py | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 seq_to_pheno/tcga/protein_transformer_example.py diff --git a/seq_to_pheno/tcga/protein_transformer_example.py b/seq_to_pheno/tcga/protein_transformer_example.py new file mode 100644 index 0000000..b2fd9e9 --- /dev/null +++ b/seq_to_pheno/tcga/protein_transformer_example.py @@ -0,0 +1,270 @@ +import torch +import transformers +import pandas as pd +import matplotlib.pyplot as plt +from sklearn.model_selection import train_test_split +from transformers import BertTokenizer, BertModel +import torch.nn as nn +import torch.optim as optim +from torch.utils.data import DataLoader, Dataset +import numpy as np +import logging +import os + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Load the dataset with error handling +try: + df = pd.read_csv('seq_to_pheno/tcga/data/protein_sequences_metadata.tsv', sep='\t') + logger.info("Dataset loaded successfully.") +except FileNotFoundError as e: + logger.error(f"File not found: {e}") + raise +except pd.errors.EmptyDataError as e: + logger.error(f"Empty data error: {e}") + raise +except Exception as e: + logger.error(f"An error occurred while loading the dataset: {e}") + raise + +# Extract relevant columns +try: + sequences = df['mutated_protein'].values + transcript_ids = df['transcript_id'].values + survival_times = df['Donor Survival Time'].values + logger.info("Relevant columns extracted successfully.") +except KeyError as e: + logger.error(f"Key error: {e}") + raise + +# Split the dataset into training and validation sets +try: + train_sequences, val_sequences, train_labels, val_labels, train_ids, val_ids = 
train_test_split( + sequences, survival_times, transcript_ids, test_size=0.2, random_state=42 + ) + logger.info("Dataset split into training and validation sets.") +except Exception as e: + logger.error(f"An error occurred during dataset splitting: {e}") + raise + +# Define custom dataset class +class ProteinDataset(Dataset): + def __init__(self, sequences, labels, tokenizer, max_length): + self.sequences = sequences + self.labels = labels + self.tokenizer = tokenizer + self.max_length = max_length + + def __len__(self): + return len(self.sequences) + + def __getitem__(self, idx): + sequence = self.sequences[idx] + label = self.labels[idx] + + try: + # Tokenize the sequence + encoding = self.tokenizer( + sequence, + add_special_tokens=True, + max_length=self.max_length, + padding='max_length', + truncation=True, + return_tensors='pt' + ) + except Exception as e: + logger.error(f"An error occurred during tokenization: {e}") + raise + + input_ids = encoding['input_ids'].squeeze() + attention_mask = encoding['attention_mask'].squeeze() + + return { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'labels': torch.tensor(label, dtype=torch.float) + } + +# Initialize the tokenizer +try: + tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert') + logger.info("Tokenizer initialized.") +except Exception as e: + logger.error(f"An error occurred while initializing the tokenizer: {e}") + raise + +# Define hyperparameters +MAX_LENGTH = 512 +BATCH_SIZE = 8 +LEARNING_RATE = 1e-5 +EPOCHS = 5 + +# Create datasets and dataloaders +train_dataset = ProteinDataset(train_sequences, train_labels, tokenizer, MAX_LENGTH) +val_dataset = ProteinDataset(val_sequences, val_labels, tokenizer, MAX_LENGTH) + +train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) +val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False) + +# Define the model +class ProteinBERTRegressor(nn.Module): + def __init__(self): + super(ProteinBERTRegressor, self).__init__() + try: + self.bert = BertModel.from_pretrained('Rostlab/prot_bert') + self.regressor = nn.Linear(self.bert.config.hidden_size, 1) + logger.info("Model initialized.") + except Exception as e: + logger.error(f"An error occurred while initializing the model: {e}") + raise + + def forward(self, input_ids, attention_mask): + try: + outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask) + pooler_output = outputs.pooler_output + output = self.regressor(pooler_output) + return output + except Exception as e: + logger.error(f"An error occurred during forward pass: {e}") + raise + +# Initialize the model, loss function, and optimizer +device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu') +model = ProteinBERTRegressor().to(device) +criterion = nn.MSELoss() +optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) + +# Training loop +train_losses = [] +val_losses = [] +for epoch in range(EPOCHS): + model.train() + total_loss = 0 + for batch in train_loader: + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + labels = batch['labels'].to(device) + + optimizer.zero_grad() + try: + outputs = model(input_ids, attention_mask) + loss = criterion(outputs.squeeze(), labels) + loss.backward() + optimizer.step() + except Exception as e: + logger.error(f"An error occurred during training: {e}") + raise + + total_loss += loss.item() + + avg_train_loss = total_loss / len(train_loader) + 
train_losses.append(avg_train_loss) + logger.info(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_train_loss:.4f}") + + # Validation loop + model.eval() + val_loss = 0 + val_predictions = [] + val_labels_list = [] + val_ids_list = [] + with torch.no_grad(): + for batch_idx, batch in enumerate(val_loader): + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + labels = batch['labels'].to(device) + + try: + outputs = model(input_ids, attention_mask) + loss = criterion(outputs.squeeze(), labels) + val_loss += loss.item() + + val_predictions.extend(outputs.squeeze().tolist()) + val_labels_list.extend(labels.tolist()) + val_ids_list.extend(val_ids[batch_idx * BATCH_SIZE:(batch_idx + 1) * BATCH_SIZE]) + except Exception as e: + logger.error(f"An error occurred during validation: {e}") + raise + + avg_val_loss = val_loss / len(val_loader) + val_losses.append(avg_val_loss) + logger.info(f"Epoch {epoch+1}/{EPOCHS}, Validation Loss: {avg_val_loss:.4f}") + +# Plot training and validation loss +plt.plot(range(1, EPOCHS + 1), train_losses, label='Training Loss') +plt.plot(range(1, EPOCHS + 1), val_losses, label='Validation Loss') +plt.xlabel('Epochs') +plt.ylabel('Loss') +plt.title('Training and Validation Loss') +plt.legend() +plt.show() + +# Identify transcripts with highest and lowest relationship to Donor Survival Time +sorted_indices = np.argsort(val_predictions) +lowest_transcripts = [val_ids_list[i] for i in sorted_indices[:5]] +highest_transcripts = [val_ids_list[i] for i in sorted_indices[-5:]] + +logger.info("Transcripts with lowest predicted survival times:") +for transcript in lowest_transcripts: + logger.info(transcript) + +logger.info("\nTranscripts with highest predicted survival times:") +for transcript in highest_transcripts: + logger.info(transcript) + +# Explanation for implementing the model after training +# To implement the model after training, you can save the model state and use it for inference as follows: + +# Save the model +try: + model_path = 'seq_to_pheno/tcga/data/protein_bert_regressor.pth' + torch.save(model.state_dict(), model_path) + logger.info("Model saved successfully.") +except Exception as e: + logger.error(f"An error occurred while saving the model: {e}") + raise + +# Load the model for inference +def load_model(model_path): + model = ProteinBERTRegressor() + try: + model.load_state_dict(torch.load(model_path, map_location=device)) + model.to(device) + model.eval() + logger.info("Model loaded successfully.") + except Exception as e: + logger.error(f"An error occurred while loading the model: {e}") + raise + return model + +# Example of using the model for inference +def predict(model, sequence): + try: + encoding = tokenizer( + sequence, + add_special_tokens=True, + max_length=MAX_LENGTH, + padding='max_length', + truncation=True, + return_tensors='pt' + ) + + input_ids = encoding['input_ids'].to(device) + attention_mask = encoding['attention_mask'].to(device) + + with torch.no_grad(): + output = model(input_ids, attention_mask) + return output.item() + except Exception as e: + logger.error(f"An error occurred during prediction: {e}") + raise + +# Load the model and make a prediction +try: + model = load_model(model_path) + example_sequence = "MDDYMVLRMIGEGSFGRALLVQHESSNQMFAMKEIRLPKSFSNTQN" + predicted_survival_time = predict(model, example_sequence) + logger.info(f"Predicted Survival Time: {predicted_survival_time:.2f}") +except Exception as e: + logger.error(f"An error occurred during model prediction: {e}") \ No 
newline at end of file From b9bf4cbffb938024843cc8fac988233907c9ba7c Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Mon, 7 Oct 2024 00:51:21 -0400 Subject: [PATCH 20/22] Making it so that it can run on my Macbook Pro 16GB ram and M1 Chip and 9 cpus --- seq_to_pheno/tcga/protein_transformer_example.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/seq_to_pheno/tcga/protein_transformer_example.py b/seq_to_pheno/tcga/protein_transformer_example.py index b2fd9e9..588d066 100644 --- a/seq_to_pheno/tcga/protein_transformer_example.py +++ b/seq_to_pheno/tcga/protein_transformer_example.py @@ -96,8 +96,8 @@ def __getitem__(self, idx): raise # Define hyperparameters -MAX_LENGTH = 512 -BATCH_SIZE = 8 +MAX_LENGTH = 256 +BATCH_SIZE = 2 LEARNING_RATE = 1e-5 EPOCHS = 5 @@ -138,11 +138,12 @@ def forward(self, input_ids, attention_mask): # Training loop train_losses = [] +gradient_accumulation_steps = 4 val_losses = [] for epoch in range(EPOCHS): model.train() total_loss = 0 - for batch in train_loader: + for batch_idx, batch in enumerate(train_loader): input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) labels = batch['labels'].to(device) @@ -151,8 +152,12 @@ def forward(self, input_ids, attention_mask): try: outputs = model(input_ids, attention_mask) loss = criterion(outputs.squeeze(), labels) + loss = loss / gradient_accumulation_steps loss.backward() - optimizer.step() + + if (batch_idx + 1) % gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() except Exception as e: logger.error(f"An error occurred during training: {e}") raise From bb20c753ef3e07fa6887e5486d1d431eeb105192 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Mon, 7 Oct 2024 12:06:10 -0400 Subject: [PATCH 21/22] Some training optimizations --- .../tcga/protein_transformer_example.py | 240 +++++++++--------- 1 file changed, 126 insertions(+), 114 deletions(-) diff --git a/seq_to_pheno/tcga/protein_transformer_example.py b/seq_to_pheno/tcga/protein_transformer_example.py index 588d066..483838f 100644 --- a/seq_to_pheno/tcga/protein_transformer_example.py +++ b/seq_to_pheno/tcga/protein_transformer_example.py @@ -10,6 +10,7 @@ import numpy as np import logging import os +import runpy # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -44,6 +45,14 @@ train_sequences, val_sequences, train_labels, val_labels, train_ids, val_ids = train_test_split( sequences, survival_times, transcript_ids, test_size=0.2, random_state=42 ) + # Normalize survival times + train_labels_mean = np.mean(train_labels) + train_labels_std = np.std(train_labels) + train_labels = (train_labels - train_labels_mean) / train_labels_std + val_labels = (val_labels - train_labels_mean) / train_labels_std + # Normalize survival times + train_labels = (train_labels - np.mean(train_labels)) / np.std(train_labels) + val_labels = (val_labels - np.mean(train_labels)) / np.std(train_labels) logger.info("Dataset split into training and validation sets.") except Exception as e: logger.error(f"An error occurred during dataset splitting: {e}") @@ -78,7 +87,7 @@ def __getitem__(self, idx): logger.error(f"An error occurred during tokenization: {e}") raise - input_ids = encoding['input_ids'].squeeze() + input_ids = encoding['input_ids'].squeeze(dim=0) attention_mask = encoding['attention_mask'].squeeze() return { @@ -89,7 +98,7 @@ def __getitem__(self, idx): # Initialize the tokenizer try: - tokenizer = 
BertTokenizer.from_pretrained('Rostlab/prot_bert') + tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased') logger.info("Tokenizer initialized.") except Exception as e: logger.error(f"An error occurred while initializing the tokenizer: {e}") @@ -105,15 +114,16 @@ def __getitem__(self, idx): train_dataset = ProteinDataset(train_sequences, train_labels, tokenizer, MAX_LENGTH) val_dataset = ProteinDataset(val_sequences, val_labels, tokenizer, MAX_LENGTH) -train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True) -val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False) +train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True) +val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True) # Define the model class ProteinBERTRegressor(nn.Module): def __init__(self): super(ProteinBERTRegressor, self).__init__() try: - self.bert = BertModel.from_pretrained('Rostlab/prot_bert') + self.bert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased') + self.dropout = nn.Dropout(0.3) self.regressor = nn.Linear(self.bert.config.hidden_size, 1) logger.info("Model initialized.") except Exception as e: @@ -123,113 +133,14 @@ def __init__(self): def forward(self, input_ids, attention_mask): try: outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask) - pooler_output = outputs.pooler_output + last_hidden_state = outputs.last_hidden_state[:, 0, :] + pooler_output = self.dropout(last_hidden_state) output = self.regressor(pooler_output) return output except Exception as e: logger.error(f"An error occurred during forward pass: {e}") raise -# Initialize the model, loss function, and optimizer -device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu') -model = ProteinBERTRegressor().to(device) -criterion = nn.MSELoss() -optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) - -# Training loop -train_losses = [] -gradient_accumulation_steps = 4 -val_losses = [] -for epoch in range(EPOCHS): - model.train() - total_loss = 0 - for batch_idx, batch in enumerate(train_loader): - input_ids = batch['input_ids'].to(device) - attention_mask = batch['attention_mask'].to(device) - labels = batch['labels'].to(device) - - optimizer.zero_grad() - try: - outputs = model(input_ids, attention_mask) - loss = criterion(outputs.squeeze(), labels) - loss = loss / gradient_accumulation_steps - loss.backward() - - if (batch_idx + 1) % gradient_accumulation_steps == 0: - optimizer.step() - optimizer.zero_grad() - except Exception as e: - logger.error(f"An error occurred during training: {e}") - raise - - total_loss += loss.item() - - avg_train_loss = total_loss / len(train_loader) - train_losses.append(avg_train_loss) - logger.info(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_train_loss:.4f}") - - # Validation loop - model.eval() - val_loss = 0 - val_predictions = [] - val_labels_list = [] - val_ids_list = [] - with torch.no_grad(): - for batch_idx, batch in enumerate(val_loader): - input_ids = batch['input_ids'].to(device) - attention_mask = batch['attention_mask'].to(device) - labels = batch['labels'].to(device) - - try: - outputs = model(input_ids, attention_mask) - loss = criterion(outputs.squeeze(), labels) - val_loss += loss.item() - - val_predictions.extend(outputs.squeeze().tolist()) - val_labels_list.extend(labels.tolist()) - val_ids_list.extend(val_ids[batch_idx * 
BATCH_SIZE:(batch_idx + 1) * BATCH_SIZE]) - except Exception as e: - logger.error(f"An error occurred during validation: {e}") - raise - - avg_val_loss = val_loss / len(val_loader) - val_losses.append(avg_val_loss) - logger.info(f"Epoch {epoch+1}/{EPOCHS}, Validation Loss: {avg_val_loss:.4f}") - -# Plot training and validation loss -plt.plot(range(1, EPOCHS + 1), train_losses, label='Training Loss') -plt.plot(range(1, EPOCHS + 1), val_losses, label='Validation Loss') -plt.xlabel('Epochs') -plt.ylabel('Loss') -plt.title('Training and Validation Loss') -plt.legend() -plt.show() - -# Identify transcripts with highest and lowest relationship to Donor Survival Time -sorted_indices = np.argsort(val_predictions) -lowest_transcripts = [val_ids_list[i] for i in sorted_indices[:5]] -highest_transcripts = [val_ids_list[i] for i in sorted_indices[-5:]] - -logger.info("Transcripts with lowest predicted survival times:") -for transcript in lowest_transcripts: - logger.info(transcript) - -logger.info("\nTranscripts with highest predicted survival times:") -for transcript in highest_transcripts: - logger.info(transcript) - -# Explanation for implementing the model after training -# To implement the model after training, you can save the model state and use it for inference as follows: - -# Save the model -try: - model_path = 'seq_to_pheno/tcga/data/protein_bert_regressor.pth' - torch.save(model.state_dict(), model_path) - logger.info("Model saved successfully.") -except Exception as e: - logger.error(f"An error occurred while saving the model: {e}") - raise - # Load the model for inference def load_model(model_path): model = ProteinBERTRegressor() @@ -265,11 +176,112 @@ def predict(model, sequence): logger.error(f"An error occurred during prediction: {e}") raise -# Load the model and make a prediction -try: - model = load_model(model_path) - example_sequence = "MDDYMVLRMIGEGSFGRALLVQHESSNQMFAMKEIRLPKSFSNTQN" - predicted_survival_time = predict(model, example_sequence) - logger.info(f"Predicted Survival Time: {predicted_survival_time:.2f}") -except Exception as e: - logger.error(f"An error occurred during model prediction: {e}") \ No newline at end of file +if __name__ == '__main__': + # Initialize the model, loss function, and optimizer + device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu') + model = ProteinBERTRegressor().to(device) + criterion = nn.SmoothL1Loss() + optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE) + + train_losses = [] + gradient_accumulation_steps = 4 + val_losses = [] + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + for batch_idx, batch in enumerate(train_loader): + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + labels = batch['labels'].to(device) + + optimizer.zero_grad() + try: + outputs = model(input_ids, attention_mask) + loss = criterion(outputs.squeeze(), labels) + loss = loss / gradient_accumulation_steps + optimizer.zero_grad(set_to_none=True) + loss.backward() + + if (batch_idx + 1) % gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + except Exception as e: + logger.error(f"An error occurred during training: {e}") + raise + + total_loss += loss.item() + + avg_train_loss = total_loss / len(train_loader) + train_losses.append(avg_train_loss) + logger.info(f"Epoch {epoch+1}/{EPOCHS}, Training Loss: {avg_train_loss:.4f}") + + # Validation loop + model.eval() + val_loss = 0 + val_predictions = [] + 
val_labels_list = [] + val_ids_list = [] + with torch.no_grad(): + for batch_idx, batch in enumerate(val_loader): + input_ids = batch['input_ids'].to(device) + attention_mask = batch['attention_mask'].to(device) + labels = batch['labels'].to(device) + + try: + outputs = model(input_ids, attention_mask) + loss = criterion(outputs.squeeze(), labels) + val_loss += loss.item() + + val_predictions.extend(outputs.view(-1).tolist()) + val_labels_list.extend(labels.tolist()) + val_ids_list.extend(val_ids[batch_idx * BATCH_SIZE : min((batch_idx + 1) * BATCH_SIZE, len(val_ids))]) + except Exception as e: + logger.error(f"An error occurred during validation: {e}") + raise + + avg_val_loss = val_loss / len(val_loader) + val_losses.append(avg_val_loss) + logger.info(f"Epoch {epoch+1}/{EPOCHS}, Validation Loss: {avg_val_loss:.4f}") + + # Plot training and validation loss + plt.plot(range(1, EPOCHS + 1), train_losses, label='Training Loss') + plt.plot(range(1, EPOCHS + 1), val_losses, label='Validation Loss') + plt.xlabel('Epochs') + plt.ylabel('Loss') + plt.title('Training and Validation Loss') + plt.legend() + plt.show() + + # Identify transcripts with highest and lowest relationship to Donor Survival Time + sorted_indices = np.argsort(val_predictions) + lowest_transcripts = [val_ids_list[i] for i in sorted_indices[:5]] + highest_transcripts = [val_ids_list[i] for i in sorted_indices[-5:]] + + logger.info("Transcripts with lowest predicted survival times:") + for transcript in lowest_transcripts: + logger.info(transcript) + + logger.info("\nTranscripts with highest predicted survival times:") + for transcript in highest_transcripts: + logger.info(transcript) + + # Explanation for implementing the model after training + # To implement the model after training, you can save the model state and use it for inference as follows: + + # Save the model + try: + model_path = 'seq_to_pheno/tcga/data/protein_bert_regressor.pth' + torch.save(model.state_dict(), model_path) + logger.info("Model saved successfully.") + except Exception as e: + logger.error(f"An error occurred while saving the model: {e}") + raise + + # Load the model and make a prediction + try: + model = load_model(model_path) + example_sequence = "MDDYMVLRMIGEGSFGRALLVQHESSNQMFAMKEIRLPKSFSNTQN" + predicted_survival_time = predict(model, example_sequence) + logger.info(f"Predicted Survival Time: {predicted_survival_time:.2f}") + except Exception as e: + logger.error(f"An error occurred during model prediction: {e}") \ No newline at end of file From 8bc82be814e905a5403cde2c925f89e835e41b42 Mon Sep 17 00:00:00 2001 From: Harrison Reed Date: Tue, 15 Oct 2024 22:04:01 -0400 Subject: [PATCH 22/22] Fixed fetched protein metadata --- .../tcga/create_protein_metadata_table.py | 57 +++++++++++++++++-- .../tcga/protein_transformer_example.py | 55 +++++++++--------- 2 files changed, 80 insertions(+), 32 deletions(-) diff --git a/seq_to_pheno/tcga/create_protein_metadata_table.py b/seq_to_pheno/tcga/create_protein_metadata_table.py index b06eb95..634d360 100644 --- a/seq_to_pheno/tcga/create_protein_metadata_table.py +++ b/seq_to_pheno/tcga/create_protein_metadata_table.py @@ -1,6 +1,10 @@ import os import pandas as pd import argparse +import time +import requests +import numpy as np # Make sure to import numpy if not already imported + def read_fasta_sequence(fasta_path): """ @@ -15,6 +19,36 @@ def read_fasta_sequence(fasta_path): sequence += line.strip() return sequence +def fetch_gene_names(transcript_ids): + transcript_ids = list(transcript_ids) + server = 
"https://grch37.rest.ensembl.org" + endpoint = "/lookup/id" + headers = {"Content-Type": "application/json"} + gene_name_map = {} + + batch_size = 200 # Reduce batch size if necessary + for i in range(0, len(transcript_ids), batch_size): + batch = transcript_ids[i:i+batch_size] + data = {"ids": batch} + try: + response = requests.post(f"{server}{endpoint}", headers=headers, json=data) + if not response.ok: + response.raise_for_status() + decoded = response.json() + for tid, info in decoded.items(): + if info: + gene_name = info.get('display_name') + gene_name_map[tid] = gene_name + else: + gene_name_map[tid] = None + except requests.exceptions.RequestException as e: + print(f"Error fetching gene names for batch starting at index {i}: {e}") + # You may choose to retry or skip this batch + continue + time.sleep(1) # To respect rate limits + return gene_name_map + + def main(): # Parse command-line arguments parser = argparse.ArgumentParser(description='Create protein sequences metadata table.') @@ -25,11 +59,11 @@ def main(): # Paths to directories mutated_proteins_dir = 'seq_to_pheno/tcga/data/variants/mutated_proteins' wildtype_proteins_dir = 'seq_to_pheno/tcga/data/variants/wildtype_proteins' - metadata_file = 'seq_to_pheno/tcga/data/rnaseq.extended.metadata.aliquot_id.V4.tsv.gz' + metadata_file = 'seq_to_pheno/tcga/data/new_metadata.csv' output_file = 'seq_to_pheno/tcga/data/protein_sequences_metadata.tsv' # Read the metadata file into a DataFrame - metadata_df = pd.read_csv(metadata_file, sep='\t', compression='gzip', low_memory=False) + metadata_df = pd.read_csv(metadata_file, sep=',', low_memory=False) # List all mutated protein FASTA files mutated_files = [f for f in os.listdir(mutated_proteins_dir) if f.endswith('.fasta')] @@ -65,7 +99,6 @@ def main(): continue # Extract required metadata fields - # Ensure to handle missing data sample_metadata = sample_metadata.iloc[0] # Get the first matching row # Columns to extract @@ -107,11 +140,27 @@ def main(): data_rows.append(row) + # Limit to first 50 entries if including sequences + # if args.include_sequences and len(data_rows) >= 50: + # break + # Create a DataFrame from the collected rows result_df = pd.DataFrame(data_rows) + + # Ensure transcript IDs are strings + result_df['transcript_id'] = result_df['transcript_id'].astype(str) + + # Clean transcript IDs + result_df['transcript_id'] = result_df['transcript_id'].str.split('.').str[0].str.strip() + + # Fetch gene names + transcript_ids = result_df['transcript_id'].unique().tolist() + gene_name_map = fetch_gene_names(transcript_ids) + mapping_df = pd.DataFrame(list(gene_name_map.items()), columns=['transcript_id', 'gene_name']) + metadata_with_gene = result_df.merge(mapping_df, on='transcript_id', how='left') # Save the DataFrame to a TSV file - result_df.to_csv(output_file, index=False, sep='\t') + metadata_with_gene.to_csv(output_file, index=False, sep='\t') print(f"Metadata table saved to {output_file}") diff --git a/seq_to_pheno/tcga/protein_transformer_example.py b/seq_to_pheno/tcga/protein_transformer_example.py index 483838f..e08d62c 100644 --- a/seq_to_pheno/tcga/protein_transformer_example.py +++ b/seq_to_pheno/tcga/protein_transformer_example.py @@ -17,18 +17,19 @@ logger = logging.getLogger(__name__) # Load the dataset with error handling -try: - df = pd.read_csv('seq_to_pheno/tcga/data/protein_sequences_metadata.tsv', sep='\t') - logger.info("Dataset loaded successfully.") -except FileNotFoundError as e: - logger.error(f"File not found: {e}") - raise -except 
pd.errors.EmptyDataError as e: - logger.error(f"Empty data error: {e}") - raise -except Exception as e: - logger.error(f"An error occurred while loading the dataset: {e}") - raise +if 'df' not in globals(): + try: + df = pd.read_csv('seq_to_pheno/tcga/data/protein_sequences_metadata.tsv', sep=' ') + logger.info("Dataset loaded successfully.") + except FileNotFoundError as e: + logger.error(f"File not found: {e}") + raise + except pd.errors.EmptyDataError as e: + logger.error(f"Empty data error: {e}") + raise + except Exception as e: + logger.error(f"An error occurred while loading the dataset: {e}") + raise # Extract relevant columns try: @@ -41,22 +42,20 @@ raise # Split the dataset into training and validation sets -try: - train_sequences, val_sequences, train_labels, val_labels, train_ids, val_ids = train_test_split( - sequences, survival_times, transcript_ids, test_size=0.2, random_state=42 - ) - # Normalize survival times - train_labels_mean = np.mean(train_labels) - train_labels_std = np.std(train_labels) - train_labels = (train_labels - train_labels_mean) / train_labels_std - val_labels = (val_labels - train_labels_mean) / train_labels_std - # Normalize survival times - train_labels = (train_labels - np.mean(train_labels)) / np.std(train_labels) - val_labels = (val_labels - np.mean(train_labels)) / np.std(train_labels) - logger.info("Dataset split into training and validation sets.") -except Exception as e: - logger.error(f"An error occurred during dataset splitting: {e}") - raise +if 'train_sequences' not in globals(): + try: + train_sequences, val_sequences, train_labels, val_labels, train_ids, val_ids = train_test_split( + sequences, survival_times, transcript_ids, test_size=0.2, random_state=42 + ) + # Normalize survival times + train_labels_mean = np.mean(train_labels) + train_labels_std = np.std(train_labels) + train_labels = (train_labels - train_labels_mean) / train_labels_std + val_labels = (val_labels - train_labels_mean) / train_labels_std + logger.info("Dataset split into training and validation sets.") + except Exception as e: + logger.error(f"An error occurred during dataset splitting: {e}") + raise # Define custom dataset class class ProteinDataset(Dataset):