Commit
pipparichter committed Jan 7, 2025
1 parent 057a471 commit 9e2db28
Showing 23 changed files with 1,068 additions and 5,709 deletions.
Binary file modified models/binary_model_aa_1mer.pkl
Binary file modified models/binary_model_aa_2mer.pkl
Binary file modified models/binary_model_aa_3mer.pkl
Binary file removed models/binary_model_aac.pkl
Binary file modified models/binary_model_len.pkl
Binary file modified models/binary_model_plm.pkl
Binary file modified models/ternary_model_aa_1mer.pkl
Binary file modified models/ternary_model_aa_2mer.pkl
Binary file modified models/ternary_model_aa_3mer.pkl
Binary file removed models/ternary_model_aac.pkl
Binary file modified models/ternary_model_len.pkl
Binary file modified models/ternary_model_plm.pkl
439 changes: 439 additions & 0 deletions notebooks/bac20_r207_proteins_subset.ipynb


2,882 changes: 0 additions & 2,882 deletions notebooks/make-figures-v2.ipynb

This file was deleted.

3,071 changes: 573 additions & 2,498 deletions notebooks/sandbox.ipynb


18 changes: 18 additions & 0 deletions notebooks/test.ipynb
@@ -0,0 +1,18 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
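
The new test.ipynb is an empty single-cell scaffold. For reference, a sketch of generating an equivalent file programmatically with nbformat (the nbformat_minor version it writes may differ from the 2 shown above):

import nbformat

nb = nbformat.v4.new_notebook(metadata={'language_info': {'name': 'python'}})
nb.cells.append(nbformat.v4.new_code_cell(source=''))
nbformat.write(nb, 'notebooks/test.ipynb')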
244 changes: 0 additions & 244 deletions notebooks/validation.ipynb

This file was deleted.

48 changes: 1 addition & 47 deletions scripts/predict-gtdb.py
@@ -11,13 +11,6 @@
from selenobot.datasets import Dataset
from typing import List

# Define some important directories...
ROOT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
RESULTS_DIR = os.path.join(ROOT_DIR, 'results') # Get the path where results are stored.
MODELS_DIR = os.path.join(ROOT_DIR, 'models') # Get the path where models are stored.
DATA_DIR = os.path.join(ROOT_DIR, 'data') # Get the path where data are stored.
SCRIPTS_DIR = os.path.join(ROOT_DIR, 'scripts') # Get the path where scripts are stored.

EMBEDDINGS_DIR = '/central/groups/fischergroup/prichter/gtdb/embeddings'

# Count queries are kind of slow, so just storing the table sizes as variables.
@@ -35,46 +28,7 @@
# The selenocysteine-specific elongation factor, SelB, recognizes an in-frame stop codon followed by a selenocysteine insertion
# sequence (SECIS).

SELD_KO = 'K01008'
SELA_KO = 'K01042'
SELB_KO = 'K03833'

# sbatch --mem 100GB --time 100:00:00 --wrap "python predict-gtdb.py --results-dir /central/groups/fischergroup/prichter/selenobot/results/"
def get_copy_numbers(output_path:str=None):
    '''Take what is probably a faster approach to finding copy numbers: grab every annotation that matches one of
    the selenoprotein genes, and then group the results by genome.'''
    query = Query('annotations_kegg')
    query.equal_to('ko', [SELA_KO, SELB_KO, SELD_KO])
    results_dir, _ = os.path.split(output_path)

    total = query.count()
    print(f'get_copy_numbers: {total} genes annotated as selA, selB, or selD.')

    page_df = query.next()
    selabd_annotations_df = []
    pbar = tqdm(total=total, desc='get_copy_numbers: Retrieving selenoprotein gene copy numbers... (page 0)')
    while page_df is not None:
        selabd_annotations_df.append(page_df)
        pbar.update(len(page_df))
        pbar.set_description(f'get_copy_numbers: Retrieving selenoprotein gene copy numbers... (page {len(selabd_annotations_df)})')
        page_df = query.next()
    selabd_annotations_df = pd.concat(selabd_annotations_df)
    # Save the annotation results as an intermediate.
    selabd_annotations_df.set_index('id').to_csv(os.path.join(results_dir, 'gtdb_selabd_annotations.csv'))

    copy_nums_df = []
    for genome_id, genome_id_df in selabd_annotations_df.groupby('genome_id'):
        row = dict()
        row['seld_copy_num'] = np.sum(genome_id_df.ko == SELD_KO).item()
        row['sela_copy_num'] = np.sum(genome_id_df.ko == SELA_KO).item()
        row['selb_copy_num'] = np.sum(genome_id_df.ko == SELB_KO).item()
        row['genome_id'] = genome_id
        copy_nums_df.append(row)
    copy_nums_df = pd.DataFrame(copy_nums_df).set_index('genome_id') # Build the DataFrame once, indexed by genome.

    copy_nums_df.to_csv(output_path) # Write to the caller-specified path, matching the message below.
    print(f'get_copy_numbers: Copy number information written to {output_path}')



def get_genome_data(genome_ids:List[str], batch_size=50, output_path:str=None):
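As an aside, the copy-number tabulation that get_copy_numbers performs with an explicit loop can be written as a single pandas chain. A minimal sketch, assuming the intermediate gtdb_selabd_annotations.csv written above (with its genome_id and ko columns) is the input; the path is hypothetical:

import pandas as pd

selabd_annotations_df = pd.read_csv('gtdb_selabd_annotations.csv')
# One row per genome, one column per KO group, values are copy numbers.
copy_nums_df = (selabd_annotations_df
    .groupby('genome_id').ko
    .value_counts()
    .unstack(fill_value=0)
    .rename(columns={'K01042':'sela_copy_num', 'K03833':'selb_copy_num', 'K01008':'seld_copy_num'}))
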
2 changes: 1 addition & 1 deletion scripts/predict.py
@@ -21,8 +21,8 @@

model_name = f'ternary_model_{args.type}.pkl' if (args.n_classes == 3) else f'binary_model_{args.type}.pkl'
model_path = os.path.join(args.models_dir, model_name)

model = Classifier.load(model_path)
print(f'Predicting using {model_name} trained on {model.time_stamp}.')

dataset = Dataset.from_hdf(args.input_path, feature_type=args.type, n_classes=args.n_classes)

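For reference, a hypothetical invocation of this script; the flag names are assumptions inferred from the args attributes used above, not confirmed by the diff:

python predict.py --input-path data/embeddings.h5 --type plm --n-classes 2 --models-dir models/

With --n-classes 2 this would load models/binary_model_plm.pkl; with --n-classes 3, models/ternary_model_plm.pkl.
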
27 changes: 0 additions & 27 deletions scripts/sandbox.py

This file was deleted.

1 change: 0 additions & 1 deletion selenobot/classifiers.py
@@ -182,7 +182,6 @@ def predict(self, dataset) -> pd.DataFrame:

        # Organize the predictions into a DataFrame.
        predictions = pd.DataFrame(index=dataset.ids)
        # for i, category in dataset.categories.items():
        for i in range(outputs.shape[-1]):
            predictions[f'probability_{dataset.categories[i]}'] = outputs[:, i].ravel()
        predictions['prediction'] = np.argmax(outputs, axis=1).ravel() # Convert out of one-hot encodings.
43 changes: 35 additions & 8 deletions selenobot/files.py
@@ -136,7 +136,7 @@ class BLASTFile(File):

    fields = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore'] # This is the default.
    fields += ['qcovs', 'qcovhsp', 'qlen', 'slen'] # Some extra stuff which is helpful.

    field_map = dict()
    field_map['qseqid'] = 'query_id' # Query or source (gene) sequence id
    field_map['sseqid'] = 'subject_id' # Subject or target (gene) sequence id
@@ -155,7 +155,21 @@ class BLASTFile(File):
    field_map['slen'] = 'subject_sequence_length'
    # https://www.biostars.org/p/121972/
    field_map['qcovs'] = 'query_coverage_per_subject'
    field_map['qcovhsp'] = 'query_coverage_per_pair'
    field_map['qcovhsp'] = 'query_coverage_per_pair' # This seems to be a percentage, equal to alignment_length / query_sequence_length.

    @staticmethod
    def remove_swissprot_tag(id_:str):
        '''I am not sure why the BLAST tool has started doing this, but it is appending a sp|{id_}| tag to some
        of the subject sequences, which is not present in the original FASTA file.'''
        match = re.match(r'sp\|([A-Za-z0-9]+)\|', id_) # Assumes the re module is imported at the top of files.py.
        return match.group(1) if (match is not None) else id_

    @staticmethod
    def adjust_sequence_identity(row):
        '''Compute a length-adjusted sequence identity: the number of matching positions (alignment length minus
        mismatches) divided by the length of the longer sequence. For example, a 90-position alignment with 10
        mismatches between a 100-residue query and a 200-residue subject gives (90 - 10) / 200 = 0.4.'''
        adjusted_sequence_identity = (row.alignment_length - row.mismatch)
        adjusted_sequence_identity = adjusted_sequence_identity / max(row.query_sequence_length, row.subject_sequence_length)
        return adjusted_sequence_identity

    def __init__(self, path:str):

@@ -164,16 +178,29 @@ def __init__(self, path:str):
        self.df['id'] = self.df.query_id # Use the query ID as the main ID.
        self.df = self.df.set_index('id')

        self.df['subject_id'] = self.df.subject_id.apply(lambda id_ : BLASTFile.remove_swissprot_tag(id_))
        # Adjust the sequence identity to account for alignment length.
        self.df['adjusted_sequence_identity'] = self.df.apply(BLASTFile.adjust_sequence_identity, axis=1)

        # NOTE: There can be alignments for different portions of the query and target sequences, which this does not account for.
        # I am counting on the fact that this will not affect things much.

    def drop_duplicate_hsps(self, col:str='adjusted_sequence_identity', how:str='highest'):
        '''A BLAST result can have multiple alignments for the same query-subject pair (each of these alignments is called
        an HSP). If you specify that you only want one HSP per query-subject pair, BLAST will just select the alignment with
        the lowest E-value. However, I care more about sequence identity, so I am selecting the best HSPs manually.'''
        # There are two relevant parameters per HSP: sequence_identity and length (the length of the aligned region).
        if how == 'lowest':
            ascending = True
        elif how == 'highest':
            ascending = False
        else:
            raise ValueError("BLASTFile.drop_duplicate_hsps: how must be one of 'highest', 'lowest'.")

        self.df = self.df.sort_values(col, ascending=ascending)
        self.df.drop_duplicates(subset=['query_id', 'subject_id'], keep='first', inplace=True)

    def to_df(self) -> pd.DataFrame:
        return self.df

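A minimal usage sketch for the updated BLASTFile (hypothetical paths; assumes the BLAST output was generated as tabular outfmt 6 with the extended fields listed in BLASTFile.fields):

from selenobot.files import BLASTFile

blast_file = BLASTFile('results/blast_output.tsv') # Parses hits, strips sp|...| tags, adds adjusted_sequence_identity.
blast_file.drop_duplicate_hsps(col='adjusted_sequence_identity', how='highest') # One HSP per query-subject pair.
df = blast_file.to_df()
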
2 changes: 1 addition & 1 deletion selenobot/tools.py
@@ -36,7 +36,7 @@ def run(self, query_path:str, subject_path:str, overwrite:bool=False, verbose:bo
            max_high_scoring_pairs:int=None,
            max_subject_sequences:int=None,
            make_database:bool=True,
            max_e_value:float=1e-5,
            max_e_value:float=None,
            num_threads:int=4) -> str:
        '''Run the blastp program on the query and subject files.
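Changing the max_e_value default from 1e-5 to None makes the E-value cutoff opt-in. A sketch of what this presumably implies for how run builds the blastp command (the method body is not shown in this diff, so this is an assumption, not the actual implementation):

cmd = ['blastp', '-query', query_path, '-subject', subject_path, '-num_threads', str(num_threads)]
if max_e_value is not None: # With the new default, no E-value filter is applied unless the caller supplies one.
    cmd += ['-evalue', str(max_e_value)]
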
