From 5d30c6acf4ca99a77ae621abe0e7b71dc49af03e Mon Sep 17 00:00:00 2001 From: Philippa Richter Date: Tue, 14 Jan 2025 14:57:43 -0800 Subject: [PATCH] typo --- scripts/reformat-gtdb-embeddings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/reformat-gtdb-embeddings.py b/scripts/reformat-gtdb-embeddings.py index 692f162..a921da1 100644 --- a/scripts/reformat-gtdb-embeddings.py +++ b/scripts/reformat-gtdb-embeddings.py @@ -34,7 +34,7 @@ def to_df(self): return df -def read_proteins_file(genome_id:str, prefix:str, dir_:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame: +def load_proteins(genome_id:str, prefix:str, dir_:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame: file_name = file_name_format.format(genome_id=genome_id, prefix=prefix) path = os.path.join(dir_, file_name) @@ -52,7 +52,7 @@ def parse_description(description:str): return df -def read_embeddings_file(genome_id:str, prefix:str, proteins_dir:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame: +def load_embeddings(genome_id:str, prefix:str, proteins_dir:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame: file_name = file_name_format.format(genome_id=genome_id, prefix=prefix) path = os.path.join(dir_, file_name) @@ -78,8 +78,8 @@ def read_embeddings_file(genome_id:str, prefix:str, proteins_dir:str=None, file_ pbar = tqdm(total=len(genome_metadata_df), desc='Reading genome data...') for row in genome_metadata_df.itertuples(): genome_id, prefix = row.Index, row.prefix - proteins_df = read_proteins_file(genome_id, prefix, dir_=args.proteins_dir, file_name_format=args.proteins_file_name_format) - proteins_df = read_proteins_file(genome_id, prefix, dir_=args.embeddings_dir, file_name_format=args.embeddings_file_name_format) + proteins_df = load_proteins(genome_id, prefix, dir_=args.proteins_dir, file_name_format=args.proteins_file_name_format) + embeddings_df = load_embeddings(genome_id, prefix, dir_=args.embeddings_dir, file_name_format=args.embeddings_file_name_format) # Remove sequences which exceed the maximum length specification. length_filter = proteins_df.seq.apply(len) < 2000