Merge pull request #259 from Knowledge-Graph-Hub/proteome_handling
Get only assembled proteomes if available, remove redundant proteomes
realmarcin authored Nov 1, 2024
2 parents c153de7 + f1b15b0 commit 5d146d4
Showing 4 changed files with 50 additions and 74 deletions.
2 changes: 1 addition & 1 deletion hpc/run_kg_transform_uniprot.sl
@@ -13,4 +13,4 @@ module load python/3.10
 cd /global/cfs/cdirs/m4689/master/kg-microbe
 source venv/bin/activate
 git checkout master
-poetry run kg transform -s UniprotFunctionalMicrobesTransform
+poetry run kg transform -s uniprot_functional_microbes
4 changes: 0 additions & 4 deletions kg_microbe/transform_utils/constants.py
@@ -437,11 +437,8 @@
 UNIPROT_PREFIX = "UniprotKB:"
 CHEMICAL_TO_PROTEIN_EDGE = "biolink:binds"
 # PROTEIN_TO_GO_EDGE = "biolink:enables"
-PROTEOME_TO_ORGANISM_EDGE = "biolink:derives_from"
 PROTEIN_TO_ORGANISM_EDGE = "biolink:derives_from"
 ORGANISM_TO_PROTEIN_EDGE = "biolink:expresses"
-PROTEIN_TO_PROTEOME_EDGE = "biolink:derives_from"
-PROTEOME_CATEGORY = "biolink:Genome"
 PROTEIN_TO_EC_EDGE = "biolink:enables"
 EC_CATEGORY = "biolink:Enzyme"
 PROTEIN_TO_RHEA_EDGE = "biolink:participates_in"
@@ -470,7 +467,6 @@
 GENE_CATEGORY = "biolink:Gene"

 PROTEOME_ID_COLUMN = "proteome_id"
-PROTEOME_PREFIX = "Proteomes:"
 UNIPROT_DATA_LIST = [
     "archaea",
     "bacteria",
31 changes: 19 additions & 12 deletions kg_microbe/utils/pandas_utils.py
@@ -25,31 +25,38 @@ def drop_duplicates(
     consolidation_columns: List = None,
 ):
     """
-    Read TSV, drop duplicates and export to same file.
+    Read TSV, drop duplicates, and export to the same file without making unnecessary copies.

     :param df: Dataframe
-    :param file_path: file path.
+    :param file_path: Path to the TSV file.
     :param sort_by_column: Column name to sort the DataFrame.
     :param consolidation_columns: List of columns to consolidate.
     """
     exclude_prefixes = DO_NOT_CHANGE_PREFIXES
     df = pd.read_csv(file_path, sep="\t", low_memory=False)
-    df_copy = df.copy()
-    if consolidation_columns and all(col in list(df_copy.columns) for col in consolidation_columns):
+
+    # Store the original NAME_COLUMN if it's in consolidation_columns
+    if consolidation_columns and NAME_COLUMN in consolidation_columns:
+        original_name_column = df[NAME_COLUMN].copy()
+
+    if consolidation_columns and all(col in df.columns for col in consolidation_columns):
         for col in consolidation_columns:
-            df_copy[col] = df_copy[col].apply(
+            df[col] = df[col].apply(
                 lambda x: (
                     str(x).lower()
                     if not any(str(x).startswith(prefix) for prefix in exclude_prefixes)
                     else x
                 )
             )
-    df_copy = df_copy.drop_duplicates().sort_values(by=[sort_by_column])
-    # Replace the "name" column with the original values
+
+    df.drop_duplicates(inplace=True)
+    df.sort_values(by=[sort_by_column], inplace=True)
+
+    # Restore the original values of the NAME_COLUMN
     if consolidation_columns and NAME_COLUMN in consolidation_columns:
-        # replace df_copy[NAME_COLUMN] with the original values based on index match
-        df_copy[NAME_COLUMN] = df.loc[df_copy.index, NAME_COLUMN]
+        df[NAME_COLUMN] = original_name_column.loc[df.index]

-    df_copy.to_csv(file_path, sep="\t", index=False)
-    return df_copy
+    df.to_csv(file_path, sep="\t", index=False)
+    return df


 def establish_transitive_relationship(
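For context, a minimal standalone sketch of the consolidation flow the revised drop_duplicates implements (toy data; "name" and the prefix tuple stand in for the repo's NAME_COLUMN and DO_NOT_CHANGE_PREFIXES constants): lowercase the consolidation columns for comparison while skipping protected CURIE prefixes, drop the duplicate rows, then restore the original casing by index.

    import pandas as pd

    # Toy stand-ins; the real values live in kg_microbe's constants module
    NAME_COLUMN = "name"
    DO_NOT_CHANGE_PREFIXES = ("UniprotKB:", "Proteomes:")

    df = pd.DataFrame(
        {
            "id": ["NCBITaxon:1", "NCBITaxon:1", "NCBITaxon:2"],
            "name": ["Escherichia Coli", "escherichia coli", "Bacillus subtilis"],
        }
    )

    original_name_column = df[NAME_COLUMN].copy()

    # Lowercase for comparison only, leaving protected prefixes untouched
    df[NAME_COLUMN] = df[NAME_COLUMN].apply(
        lambda x: (
            str(x).lower()
            if not any(str(x).startswith(p) for p in DO_NOT_CHANGE_PREFIXES)
            else x
        )
    )
    df.drop_duplicates(inplace=True)

    # Restore original casing for the surviving rows by index alignment
    df[NAME_COLUMN] = original_name_column.loc[df.index]
    print(df)  # two rows remain; "Escherichia Coli" keeps its original casing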
87 changes: 30 additions & 57 deletions kg_microbe/utils/uniprot_utils.py
@@ -47,11 +47,7 @@
     PROTEIN_TO_GO_CELLULAR_COMPONENT_EDGE,
     PROTEIN_TO_GO_MOLECULAR_FUNCTION_EDGE,
     PROTEIN_TO_ORGANISM_EDGE,
-    PROTEIN_TO_PROTEOME_EDGE,
     PROTEIN_TO_RHEA_EDGE,
-    PROTEOME_CATEGORY,
-    PROTEOME_PREFIX,
-    PROTEOME_TO_ORGANISM_EDGE,
     RDFS_SUBCLASS_OF,
     RHEA_CATEGORY,
     UNIPROT_BINDING_SITE_COLUMN_NAME,
@@ -71,8 +67,6 @@

 RELATIONS_DICT = {
     PROTEIN_TO_ORGANISM_EDGE: DERIVES_FROM,
-    PROTEOME_TO_ORGANISM_EDGE: DERIVES_FROM,
-    PROTEIN_TO_PROTEOME_EDGE: DERIVES_FROM,
     PROTEIN_TO_EC_EDGE: ENABLES,
     CHEMICAL_TO_PROTEIN_EDGE: MOLECULARLY_INTERACTS_WITH,
     PROTEIN_TO_RHEA_EDGE: PARTICIPATES_IN,
@@ -102,7 +96,6 @@
 BINDING_SITE_PARSED_COLUMN = "binding_site_parsed"
 GO_PARSED_COLUMN = "go_parsed"
 RHEA_PARSED_COLUMN = "rhea_parsed"
-PROTEOME_PARSED_COLUMN = "proteome_parsed"
 DISEASE_PARSED_COLUMN = "disease_parsed"
 GENE_PRIMARY_PARSED_COLUMN = "gene_primary_parsed"
 GO_TERM_COLUMN = "GO_Term"
@@ -315,7 +308,6 @@ def get_nodes_and_edges(
     edge_data = []
     parsed_columns = [
         ORGANISM_PARSED_COLUMN,
-        PROTEOME_PARSED_COLUMN,
         EC_NUMBER_PARSED_COLUMN,
         PROTEIN_ID_PARSED_COLUMN,
         BINDING_SITE_PARSED_COLUMN,
@@ -352,11 +344,6 @@ def get_nodes_and_edges(
     uniprot_parse_df[RHEA_PARSED_COLUMN] = uniprot_df[UNIPROT_RHEA_ID_COLUMN_NAME].apply(
         parse_rhea_entry
     )
-    uniprot_parse_df[PROTEOME_PARSED_COLUMN] = uniprot_df[UNIPROT_PROTEOME_COLUMN_NAME].apply(
-        lambda x: (
-            PROTEOME_PREFIX + x.split(":")[0].strip() if x and not is_float(x) and x != "" else x
-        )
-    )
     # Fields only in human uniprot query
     if UNIPROT_DISEASE_COLUMN_NAME in uniprot_df.columns:
         uniprot_parse_df[DISEASE_PARSED_COLUMN] = uniprot_df[UNIPROT_DISEASE_COLUMN_NAME].apply(
@@ -382,17 +369,6 @@ def get_nodes_and_edges(
             ]
         )

-        # Proteome node
-        node_data.append(
-            [
-                entry[PROTEOME_PARSED_COLUMN],
-                PROTEOME_CATEGORY,
-                entry[PROTEOME_PARSED_COLUMN],
-                "",
-                "",
-                source_name,
-            ]
-        )
         # EC node
         if entry[EC_NUMBER_PARSED_COLUMN]:
             for ec in entry[EC_NUMBER_PARSED_COLUMN]:
@@ -490,39 +466,16 @@ def get_nodes_and_edges(
             )

         # Removing protein-organism edges for now
-        # # Protein-organism
-        # edge_data.append(
-        #     [
-        #         entry[PROTEIN_ID_PARSED_COLUMN],
-        #         PROTEIN_TO_ORGANISM_EDGE,
-        #         entry[ORGANISM_PARSED_COLUMN],
-        #         RELATIONS_DICT[PROTEIN_TO_ORGANISM_EDGE],
-        #         source_name,
-        #     ]
-        # )
-
-        # Proteome-organism
-        if entry[PROTEOME_PARSED_COLUMN]:
-            edge_data.append(
-                [
-                    entry[PROTEOME_PARSED_COLUMN],
-                    PROTEOME_TO_ORGANISM_EDGE,
-                    entry[ORGANISM_PARSED_COLUMN],
-                    RELATIONS_DICT[PROTEIN_TO_ORGANISM_EDGE],
-                    source_name,
-                ]
-            )
-        # Protein-proteome
-        if entry[PROTEIN_ID_PARSED_COLUMN]:
-            edge_data.append(
-                [
-                    entry[PROTEIN_ID_PARSED_COLUMN],
-                    PROTEIN_TO_PROTEOME_EDGE,
-                    entry[PROTEOME_PARSED_COLUMN],
-                    RELATIONS_DICT[PROTEIN_TO_PROTEOME_EDGE],
-                    source_name,
-                ]
-            )
+        # Protein-organism
+        edge_data.append(
+            [
+                entry[PROTEIN_ID_PARSED_COLUMN],
+                PROTEIN_TO_ORGANISM_EDGE,
+                entry[ORGANISM_PARSED_COLUMN],
+                RELATIONS_DICT[PROTEIN_TO_ORGANISM_EDGE],
+                source_name,
+            ]
+        )

     return (node_data, edge_data)

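To make the edge shape concrete, a hypothetical example of the single row now appended to edge_data per entry (identifiers invented for illustration; the relation CURIE assumes DERIVES_FROM maps to RO:0001000, which should be checked against constants.py):

    # [subject, predicate, object, relation, primary knowledge source]
    example_edge = [
        "UniprotKB:P0A796",             # protein CURIE (hypothetical)
        "biolink:derives_from",         # PROTEIN_TO_ORGANISM_EDGE
        "NCBITaxon:83333",              # organism CURIE (hypothetical)
        "RO:0001000",                   # assumed value of DERIVES_FROM
        "uniprot_functional_microbes",  # source_name
    ]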
@@ -691,7 +644,27 @@ def check_string_in_tar(
             # Split the content into lines and count occurrences
             lines = content.splitlines()
             count = sum(bool(pattern.search(line)) for line in lines)
+            # No assembled proteomes case, otherwise only grab assembled proteomes
+            if count == 0:
+                count = len(lines)
             if count > min_line_count:
+                # Get only first proteome listed if multiple
+                proteome_column_index = next(
+                    (
+                        i
+                        for i, value in enumerate(lines[0].split("\t"))
+                        if UNIPROT_PROTEOME_COLUMN_NAME == value
+                    ),
+                    None,
+                )
+                lines = [
+                    "\t".join(
+                        line.split("\t")[:proteome_column_index]
+                        + [line.split("\t")[proteome_column_index].split(";")[0]]
+                        + line.split("\t")[proteome_column_index + 1 :]
+                    )
+                    for line in lines
+                ]
                 # Add only unique lines to the set
                 matching_members_content.extend(lines)
                 # Add the member name to the list
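As a quick illustration of the new proteome handling in check_string_in_tar, a self-contained sketch (toy TSV lines; the pattern, column name, and min_line_count are assumptions for illustration): if no line matches the assembled-proteome pattern, fall back to counting all lines; otherwise keep only the first proteome in the semicolon-separated Proteomes column of each line.

    import re

    # Assumed stand-ins; the real pattern and column name live in the transform's constants
    UNIPROT_PROTEOME_COLUMN_NAME = "Proteomes"
    pattern = re.compile(r"Chromosome")  # marker for assembled proteomes (assumption)
    min_line_count = 0

    lines = [
        "Entry\tProteomes",
        "P0A796\tUP000000625: Chromosome; UP000002032: Genome",
    ]

    # Count lines mentioning an assembled proteome; if none, fall back to all lines
    count = sum(bool(pattern.search(line)) for line in lines)
    if count == 0:
        count = len(lines)

    if count > min_line_count:
        # Find the Proteomes column from the header row
        proteome_column_index = next(
            (i for i, v in enumerate(lines[0].split("\t")) if v == UNIPROT_PROTEOME_COLUMN_NAME),
            None,
        )
        # Keep only the first proteome listed in each line
        lines = [
            "\t".join(
                line.split("\t")[:proteome_column_index]
                + [line.split("\t")[proteome_column_index].split(";")[0]]
                + line.split("\t")[proteome_column_index + 1 :]
            )
            for line in lines
        ]

    print(lines[1])  # -> "P0A796\tUP000000625: Chromosome"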
