Commit
pipparichter committed Jan 7, 2025
1 parent 057a471 commit 9e2db28
Showing 23 changed files with 1,068 additions and 5,709 deletions.
Binary file modified models/binary_model_aa_1mer.pkl
Binary file modified models/binary_model_aa_2mer.pkl
Binary file modified models/binary_model_aa_3mer.pkl
Binary file removed models/binary_model_aac.pkl
Binary file modified models/binary_model_len.pkl
Binary file modified models/binary_model_plm.pkl
Binary file modified models/ternary_model_aa_1mer.pkl
Binary file modified models/ternary_model_aa_2mer.pkl
Binary file modified models/ternary_model_aa_3mer.pkl
Binary file removed models/ternary_model_aac.pkl
Binary file modified models/ternary_model_len.pkl
Binary file modified models/ternary_model_plm.pkl
439 changes: 439 additions & 0 deletions notebooks/bac20_r207_proteins_subset.ipynb


2,882 changes: 0 additions & 2,882 deletions notebooks/make-figures-v2.ipynb

This file was deleted.

3,071 changes: 573 additions & 2,498 deletions notebooks/sandbox.ipynb


18 changes: 18 additions & 0 deletions notebooks/test.ipynb
@@ -0,0 +1,18 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
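
The new test.ipynb is an empty single-cell scaffold. For reference, a sketch of generating an equivalent file programmatically with nbformat (the nbformat_minor version it writes may differ from the 2 shown above):

import nbformat

nb = nbformat.v4.new_notebook(metadata={'language_info': {'name': 'python'}})
nb.cells.append(nbformat.v4.new_code_cell(source=''))
nbformat.write(nb, 'notebooks/test.ipynb')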
244 changes: 0 additions & 244 deletions notebooks/validation.ipynb

This file was deleted.

48 changes: 1 addition & 47 deletions scripts/predict-gtdb.py
@@ -11,13 +11,6 @@
from selenobot.datasets import Dataset
from typing import List

# Define some important directories...
ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
RESULTS_DIR = os.path.join(ROOT_DIR, 'results') # Get the path where results are stored.
MODELS_DIR = os.path.join(ROOT_DIR, 'models') # Get the path where models are stored.
DATA_DIR = os.path.join(ROOT_DIR, 'data') # Get the path where data are stored.
SCRIPTS_DIR = os.path.join(ROOT_DIR, 'scripts') # Get the path where scripts are stored.

EMBEDDINGS_DIR = '/central/groups/fischergroup/prichter/gtdb/embeddings'

# Count queries are kind of slow, so just storing the table sizes as variables.
@@ -35,46 +28,7 @@
# The selenocysteine-specific elongation factor, SelB, recognizes an in-frame stop codon followed by a selenocysteine insertion
# sequence (SECIS).

SELD_KO = 'K01008'
SELA_KO = 'K01042'
SELB_KO = 'K03833'

# sbatch --mem 100GB --time 100:00:00 --wrap "python predict-gtdb.py --results-dir /central/groups/fischergroup/prichter/selenobot/results/"
def get_copy_numbers(output_path:str=None):
    '''Take what is probably a faster approach to finding copy numbers: grab every annotation that matches one of
    the selenoprotein genes, and then group the results by genome.'''
    query = Query('annotations_kegg')
    query.equal_to('ko', [SELA_KO, SELB_KO, SELD_KO])
    results_dir, _ = os.path.split(output_path)

    total = query.count()
    print(f'get_copy_numbers: {total} genes annotated as selA, selB, or selD.')

    page_df = query.next()
    selabd_annotations_df = []
    pbar = tqdm(total=total, desc='get_copy_numbers: Retrieving selenoprotein gene copy numbers... (page 0)')
    while page_df is not None:
        selabd_annotations_df.append(page_df)
        pbar.update(len(page_df))
        pbar.set_description(f'get_copy_numbers: Retrieving selenoprotein gene copy numbers... (page {len(selabd_annotations_df)})')
        page_df = query.next()
    selabd_annotations_df = pd.concat(selabd_annotations_df)
    # Save the annotation results as an intermediate.
    selabd_annotations_df.set_index('id').to_csv(os.path.join(results_dir, 'gtdb_selabd_annotations.csv'))

    copy_nums_df = []
    for genome_id, genome_id_df in selabd_annotations_df.groupby('genome_id'):
        row = dict()
        row['seld_copy_num'] = np.sum(genome_id_df.ko == SELD_KO).item()
        row['sela_copy_num'] = np.sum(genome_id_df.ko == SELA_KO).item()
        row['selb_copy_num'] = np.sum(genome_id_df.ko == SELB_KO).item()
        row['genome_id'] = genome_id
        copy_nums_df.append(row)
    copy_nums_df = pd.DataFrame(copy_nums_df).set_index('genome_id') # Build the DataFrame once, indexed by genome.

    copy_nums_df.to_csv(output_path) # Write to the caller-specified path, matching the message below.
    print(f'get_copy_numbers: Copy number information written to {output_path}')



def get_genome_data(genome_ids:List[str], batch_size=50, output_path:str=None):
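As an aside, the copy-number tabulation that get_copy_numbers performs with an explicit loop can be written as a single pandas chain. A minimal sketch, assuming the intermediate gtdb_selabd_annotations.csv written above (with its genome_id and ko columns) is the input; the path is hypothetical:

import pandas as pd

selabd_annotations_df = pd.read_csv('gtdb_selabd_annotations.csv')
# One row per genome, one column per KO group, values are copy numbers.
copy_nums_df = (selabd_annotations_df
    .groupby('genome_id').ko
    .value_counts()
    .unstack(fill_value=0)
    .rename(columns={'K01042':'sela_copy_num', 'K03833':'selb_copy_num', 'K01008':'seld_copy_num'}))
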
2 changes: 1 addition & 1 deletion scripts/predict.py
@@ -21,8 +21,8 @@

model_name = f'ternary_model_{args.type}.pkl' if (args.n_classes == 3) else f'binary_model_{args.type}.pkl'
model_path = os.path.join(args.models_dir, model_name)

model = Classifier.load(model_path)
print(f'Predicting using {model_name} trained on {model.time_stamp}.')

dataset = Dataset.from_hdf(args.input_path, feature_type=args.type, n_classes=args.n_classes)

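For reference, a hypothetical invocation of this script; the flag names are assumptions inferred from the args attributes used above, not confirmed by the diff:

python predict.py --input-path data/embeddings.h5 --type plm --n-classes 2 --models-dir models/

With --n-classes 2 this would load models/binary_model_plm.pkl; with --n-classes 3, models/ternary_model_plm.pkl.
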
27 changes: 0 additions & 27 deletions scripts/sandbox.py

This file was deleted.

1 change: 0 additions & 1 deletion selenobot/classifiers.py
@@ -182,7 +182,6 @@ def predict(self, dataset) -> pd.DataFrame:

        # Organize the predictions into a DataFrame.
        predictions = pd.DataFrame(index=dataset.ids)
        # for i, category in dataset.categories.items():
        for i in range(outputs.shape[-1]):
            predictions[f'probability_{dataset.categories[i]}'] = outputs[:, i].ravel()
        predictions['prediction'] = np.argmax(outputs, axis=1).ravel() # Convert out of one-hot encodings.
43 changes: 35 additions & 8 deletions selenobot/files.py
@@ -136,7 +136,7 @@ class BLASTFile(File):

    fields = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] # This is the default.
    fields += ['qcovs', 'qcovhsp', 'qlen', 'slen'] # Some extra stuff which is helpful.

    field_map = dict()
    field_map['qseqid'] = 'query_id' # Query or source (gene) sequence id
    field_map['sseqid'] = 'subject_id' # Subject or target (gene) sequence id
@@ -155,7 +155,21 @@ class BLASTFile(File):
    field_map['slen'] = 'subject_sequence_length'
    # https://www.biostars.org/p/121972/
    field_map['qcovs'] = 'query_coverage_per_subject'
    field_map['qcovhsp'] = 'query_coverage_per_pair'
    field_map['qcovhsp'] = 'query_coverage_per_pair' # This seems to be a percentage, equal to alignment_length / query_sequence_length.

    @staticmethod
    def remove_swissprot_tag(id_:str):
        '''I am not sure why the BLAST tool has started doing this, but it is appending a sp|{id_}| tag to some
        of the subject sequences, which is not present in the original FASTA file.'''
        match = re.match(r'sp\|([A-Za-z0-9]+)\|', id_) # Assumes the re module is imported at the top of files.py.
        return match.group(1) if (match is not None) else id_

    @staticmethod
    def adjust_sequence_identity(row):
        '''Compute a length-adjusted sequence identity: the number of matching positions (alignment length minus
        mismatches) divided by the length of the longer sequence. For example, a 90-position alignment with 10
        mismatches between a 100-residue query and a 200-residue subject gives (90 - 10) / 200 = 0.4.'''
        adjusted_sequence_identity = (row.alignment_length - row.mismatch)
        adjusted_sequence_identity = adjusted_sequence_identity / max(row.query_sequence_length, row.subject_sequence_length)
        return adjusted_sequence_identity

    def __init__(self, path:str):

@@ -164,16 +178,29 @@ def __init__(self, path:str):
        self.df['id'] = self.df.query_id # Use the query ID as the main ID.
        self.df = self.df.set_index('id')

        self.df['subject_id'] = self.df.subject_id.apply(lambda id_ : BLASTFile.remove_swissprot_tag(id_))
        # Adjust the sequence identity to account for alignment length.
        self.df['adjusted_sequence_identity'] = self.df.apply(BLASTFile.adjust_sequence_identity, axis=1)

        # NOTE: There can be alignments for different portions of the query and target sequences, which this does not account for.
        # I am counting on the fact that this will not affect things much.

    def drop_duplicate_hsps(self, col:str='adjusted_sequence_identity', how:str='highest'):
        '''A BLAST result can have multiple alignments for the same query-subject pair (each of these alignments is called
        an HSP). If you specify that you only want one HSP per query-subject pair, BLAST will just select the alignment with
        the lowest E-value. However, I care more about sequence identity, so I am selecting the best HSPs manually.'''
        # There are two relevant parameters per HSP: sequence_identity and length (the length of the aligned region).
        if how == 'lowest':
            ascending = True
        elif how == 'highest':
            ascending = False
        else:
            raise ValueError("BLASTFile.drop_duplicate_hsps: how must be one of 'highest', 'lowest'.")

        self.df = self.df.sort_values(col, ascending=ascending)
        self.df.drop_duplicates(subset=['query_id', 'subject_id'], keep='first', inplace=True)

    def to_df(self) -> pd.DataFrame:
        return self.df

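A minimal usage sketch for the updated BLASTFile (hypothetical paths; assumes the BLAST output was generated as tabular outfmt 6 with the extended fields listed in BLASTFile.fields):

from selenobot.files import BLASTFile

blast_file = BLASTFile('results/blast_output.tsv') # Parses hits, strips sp|...| tags, adds adjusted_sequence_identity.
blast_file.drop_duplicate_hsps(col='adjusted_sequence_identity', how='highest') # One HSP per query-subject pair.
df = blast_file.to_df()
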
2 changes: 1 addition & 1 deletion selenobot/tools.py
@@ -36,7 +36,7 @@ def run(self, query_path:str, subject_path:str, overwrite:bool=False, verbose:bo
            max_high_scoring_pairs:int=None,
            max_subject_sequences:int=None,
            make_database:bool=True,
            max_e_value:float=1e-5,
            max_e_value:float=None,
            num_threads:int=4) -> str:
        '''Run the blastp program on the query and subject files.
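Changing the max_e_value default from 1e-5 to None makes the E-value cutoff opt-in. A sketch of what this presumably implies for how run builds the blastp command (the method body is not shown in this diff, so this is an assumption, not the actual implementation):

cmd = ['blastp', '-query', query_path, '-subject', subject_path, '-num_threads', str(num_threads)]
if max_e_value is not None: # With the new default, no E-value filter is applied unless the caller supplies one.
    cmd += ['-evalue', str(max_e_value)]
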
