Skip to content

Commit

Permalink
typo
Browse files: browse the repository at this point in the history
  • Loading branch information
pipparichter committed Jan 14, 2025
1 parent 111ce96 commit 5d30c6a
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions scripts/reformat-gtdb-embeddings.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,7 +34,7 @@ def to_df(self):
return df


def read_proteins_file(genome_id:str, prefix:str, dir_:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame:
def load_proteins(genome_id:str, prefix:str, dir_:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame:

file_name = file_name_format.format(genome_id=genome_id, prefix=prefix)
path = os.path.join(dir_, file_name)
Expand All @@ -52,7 +52,7 @@ def parse_description(description:str):
return df


def read_embeddings_file(genome_id:str, prefix:str, proteins_dir:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame:
def load_embeddings(genome_id:str, prefix:str, proteins_dir:str=None, file_name_format:str='{prefix}_{genome_id}_protein.faa') -> pd.DataFrame:

file_name = file_name_format.format(genome_id=genome_id, prefix=prefix)
path = os.path.join(dir_, file_name)
Expand All @@ -78,8 +78,8 @@ def read_embeddings_file(genome_id:str, prefix:str, proteins_dir:str=None, file_
pbar = tqdm(total=len(genome_metadata_df), desc='Reading genome data...')
for row in genome_metadata_df.itertuples():
genome_id, prefix = row.Index, row.prefix
proteins_df = read_proteins_file(genome_id, prefix, dir_=args.proteins_dir, file_name_format=args.proteins_file_name_format)
proteins_df = read_proteins_file(genome_id, prefix, dir_=args.embeddings_dir, file_name_format=args.embeddings_file_name_format)
proteins_df = load_proteins(genome_id, prefix, dir_=args.proteins_dir, file_name_format=args.proteins_file_name_format)
embeddings_df = load_embeddings(genome_id, prefix, dir_=args.embeddings_dir, file_name_format=args.embeddings_file_name_format)

# Remove sequences which exceed the maximum length specification.
length_filter = proteins_df.seq.apply(len) < 2000
Expand Down

0 comments on commit 5d30c6a

Please sign in to comment.