Skip to content

Commit

Permalink
remove potential duplicate rows from table
Browse files Browse the repository at this point in the history
  • Loading branch information
ammarcsj committed Apr 22, 2024
1 parent 0cf7b7d commit eb7b23e
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 0 deletions.
1 change: 1 addition & 0 deletions directlfq/lfq_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
input_df = lfqutils.sort_input_df_by_protein_id(input_df)
input_df = lfqutils.index_and_log_transform_input_df(input_df)
input_df = lfqutils.remove_allnan_rows_input_df(input_df)
input_df = lfqutils.remove_potential_quant_id_duplicates(input_df)

if not deactivate_normalization:
LOGGER.info("Performing sample normalization.")
Expand Down
9 changes: 9 additions & 0 deletions directlfq/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,15 @@ def index_and_log_transform_input_df(data_df):
def remove_allnan_rows_input_df(data_df):
    """Return ``data_df`` without the rows that contain no values at all.

    A row is dropped only when every one of its columns is NaN; rows with
    at least one non-NaN entry are kept. The input frame is not modified;
    a new frame is returned.
    """
    rows_with_data = data_df.dropna(how="all", axis=0)
    return rows_with_data

def remove_potential_quant_id_duplicates(data_df):
    """Drop rows whose quant id has already been seen and return the result.

    Rows are compared on the ``config.QUANT_ID`` column only; for each
    duplicated id the first occurrence is kept and later ones are removed.
    When anything is removed, the number of dropped rows is logged.

    Args:
        data_df: input table containing a ``config.QUANT_ID`` column.

    Returns:
        The de-duplicated table. The input frame itself is not modified.
    """
    # NOTE(review): drop_duplicates(subset=...) looks the id up among the
    # columns. If an earlier pipeline step has moved the quant id into the
    # index, this lookup would raise KeyError -- confirm against the caller.
    before_drop = len(data_df)
    data_df = data_df.drop_duplicates(subset=config.QUANT_ID, keep='first')
    entries_removed = before_drop - len(data_df)
    if entries_removed > 0:
        LOGGER.info(f"Duplicate quant_ids detected. {entries_removed} rows removed from input df.")

    # The caller rebinds its frame to this function's result, so the
    # de-duplicated frame must be returned (previously the function fell off
    # the end and implicitly returned None).
    return data_df


def sort_input_df_by_protein_id(data_df):
    """Return a copy of ``data_df`` sorted by the ``config.PROTEIN_ID`` column.

    ``ignore_index=True`` is passed to ``sort_values``, so the returned
    frame does not carry over the original row labels.
    """
    protein_id_column = config.PROTEIN_ID
    sorted_df = data_df.sort_values(by=protein_id_column, ignore_index=True)
    return sorted_df

Expand Down

0 comments on commit eb7b23e

Please sign in to comment.