Skip to content

Commit

Permalink
Update code to standardize affiliations
Browse files Browse the repository at this point in the history
  • Loading branch information
laurianvm committed Dec 4, 2023
1 parent d00b458 commit 1ffa54d
Show file tree
Hide file tree
Showing 8 changed files with 3,616 additions and 3,565 deletions.
1,028 changes: 514 additions & 514 deletions data/1-marinespecies.org_users_2023-11-07_abstract.csv

Large diffs are not rendered by default.

408 changes: 204 additions & 204 deletions data/2-mailinglist_lifewatch.be_users_2023-11-07_abstract.csv

Large diffs are not rendered by default.

Large diffs are not rendered by default.

4,800 changes: 2,400 additions & 2,400 deletions data/3-users_lifewatch_by_group_kpi_2023-11-07_abstract.csv

Large diffs are not rendered by default.

446 changes: 223 additions & 223 deletions data/3-users_lifewatch_by_group_kpi_2023-11-07_standardized_infoadded.csv

Large diffs are not rendered by default.

22 changes: 10 additions & 12 deletions src/py/lwua-ingest/add_info.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,25 @@
## add additional info to standardized institutes/affiliations
import pandas as pd
import os
from pathlib import Path

# NOTE: all paths are resolved against the current working directory, so the
# script has to be started from the project root.
PROJECTPATH = Path.cwd()
FOLDERPATH = PROJECTPATH / 'data'
REFINFOPATH = FOLDERPATH / 'reference_data' / 'AffiliationInfo.csv'
FILEPATHS = [x for x in FOLDERPATH.iterdir() if x.stem.endswith('_standardized')]

# Load reference file. The merge key is cast to str (and the result is stored,
# astype does not work in place) so it has the same type as the key column of
# the input files.
affil_info = pd.read_csv(REFINFOPATH)
affil_info['stand_institute'] = affil_info['stand_institute'].astype(str)

# Add the reference info to every '*_standardized' input file
for filepath in FILEPATHS:
    filename = filepath.stem
    new_filename = filename.replace('_standardized', '_standardized_infoadded.csv')

    df = pd.read_csv(filepath, delimiter=',')

    # Missing keys are replaced by a placeholder string so the key column only
    # holds strings; merging key columns of different types fails.
    df['stand_institute'] = df['stand_institute'].fillna('NA')
    merged_df = pd.merge(df, affil_info, on='stand_institute', how='left', suffixes=('_', '_info'))

    # Write to file. to_csv takes one target path: folder and file name must be
    # joined first, otherwise the file name ends up in the `sep` argument.
    merged_df.to_csv(FOLDERPATH / new_filename, index=False)
62 changes: 45 additions & 17 deletions src/py/lwua-ingest/raw_to_abstract.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,32 @@
## Turn raw input files into files with institutes that can be standardized & made public
import pandas as pd
import os
from pathlib import Path
import uuid

CURRENTPATH = os.path.dirname(os.path.realpath(__file__))
PROJECTPATH = os.path.abspath(os.path.join(CURRENTPATH, '..', '..', '..'))
#Functions
def read_input(filepath: Path) -> pd.DataFrame:
    """
    Takes input file and returns Dataframe with filename added as column

    '.csv' files are read as semicolon-separated, '.txt' files as
    tab-separated. The file name without its extension is stored in the
    column 'source'.

    Raises ValueError when the file has any other extension.
    """
    filepath = Path(filepath)

    # delimiter used by each supported raw input format
    delimiters = {'.csv': ';', '.txt': '\t'}

    if filepath.suffix not in delimiters:
        # fail with a clear message instead of hitting an unbound `df` below
        raise ValueError(f"Unsupported input file type '{filepath.suffix}': {filepath}")

    df = pd.read_csv(filepath, delimiter=delimiters[filepath.suffix])
    df['source'] = filepath.stem

    return df


def annonymize_input(df: pd.DataFrame) -> pd.DataFrame:

"""
Takes a pd.DataFrame containing personal information and returns the anonymized version of it
"""

df.columns = df.columns.str.lower()

#Make df --> to standardize & that can be made public
df_ = pd.DataFrame()
if 'email' in df.columns:
_mailEnds = [email_lst[-1] for email_lst in df['email'].str.split("@")]
Expand All @@ -32,7 +38,29 @@
if 'institute' in df.columns:
df_['raw_institute'] = df['institute']

df_['raw_source'] = filename
df_['identifier'] = [uuid.uuid4() for _ in range(len(df_))]

return df_


def write_to_csv(df: pd.DataFrame, filepath: Path) -> None:
    """
    Writes dataframe to csv file

    The file is written to filepath without the index column.
    """
    df.to_csv(filepath, index=False)


# CODE
# NOTE: paths are resolved against the current working directory, so the
# script has to be started from the project root.
PROJECTPATH = Path.cwd()
FOLDERPATH = PROJECTPATH / 'data' / '_LWUA_DataSystems_RawInput'
FILEPATHS = [x for x in FOLDERPATH.iterdir() if x.is_file()]

for filepath in FILEPATHS:
    # every raw file gets a '<name>_abstract.csv' counterpart in the data folder
    new_filename = filepath.stem + '_abstract.csv'
    new_filepath = PROJECTPATH / 'data' / new_filename

    df = read_input(filepath)
    df_ = annonymize_input(df)
    write_to_csv(df_, new_filepath)
75 changes: 50 additions & 25 deletions src/py/lwua-ingest/standardize.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,66 @@
## Standardize the institute names
import pandas as pd
import os
from pathlib import Path
from collections import defaultdict

CURRENTPATH = os.path.dirname(os.path.realpath(__file__))
PROJECTPATH = os.path.abspath(os.path.join(CURRENTPATH, '..', '..', '..'))
#Functions
def make_mapping_dict(affil_mapping: pd.DataFrame) -> dict:
    """
    Turn the reference mapping file into a dictionary with structure:
    {stand-affiliation-name : [all possible ways of writing affil name] }

    affil_mapping needs the columns 'Institute_standardized' and 'Institute'.
    The returned object is a defaultdict(list), with the spellings of every
    standardized name kept in file order.
    """
    mapping_dct = defaultdict(list)
    pairs = zip(affil_mapping['Institute_standardized'], affil_mapping['Institute'])
    for stand_name, raw_name in pairs:
        mapping_dct[stand_name].append(raw_name)

    # return the dictionary itself, not the (all-None) results of append()
    return mapping_dct

#Load '*_abstract' input files
FOLDERPATH = os.path.join(PROJECTPATH, 'data')
files = [item for item in os.listdir(FOLDERPATH) if item.endswith('_abstract.csv')]

for file in files:
filename = os.path.splitext(os.path.basename(file))[0]
filepath = os.path.join(FOLDERPATH, file)
df = pd.read_csv(filepath, delimiter=',')
def standardize_affiliation_names(df: pd.DataFrame, mapping_dct: dict) -> tuple:
    """
    Add standardized institute names to df using a mapping dict

    df must have a column 'raw_institute'; it is modified in place (column
    'stand_institute' is added or updated). mapping_dct has the structure
    {stand-affiliation-name : [all possible ways of writing affil name] }.
    A row matches when its raw institute, or the lower-cased raw institute,
    appears in one of the lists. When several standard names match, the one
    that comes last in mapping_dct is used.

    returns
    df with standard institute names added in a new column
    df consisting of the (de-duplicated) rows that couldn't be standardized yet
    """
    # Reverse lookup: spelling -> (position of the standard name, standard name).
    # The position is kept so that the last matching standard name wins.
    lookup = {}
    for position, (stand_inst, inst_list) in enumerate(mapping_dct.items()):
        for inst in inst_list:
            lookup[inst] = (position, stand_inst)

    # Make sure the column exists even when not a single row can be matched.
    if 'stand_institute' not in df.columns:
        df['stand_institute'] = pd.Series([None] * len(df), index=df.index, dtype=object)

    for index, raw_institute in df['raw_institute'].items():
        candidates = (raw_institute, str(raw_institute).lower())
        hits = [lookup[key] for key in candidates if key in lookup]
        if hits:
            df.at[index, 'stand_institute'] = max(hits, key=lambda hit: hit[0])[1]

    # non-standardized institutes are returned separately for manual check
    df_tostand = df.loc[df['stand_institute'].isnull()].drop_duplicates()

    return df, df_tostand


# CODE
# Locations of the input files and of the reference mapping file
PROJECTPATH = Path.cwd()
FOLDERPATH = PROJECTPATH / 'data'
FILEPATHS = [x for x in FOLDERPATH.iterdir() if x.stem.endswith('_abstract')]
REFAFFILPATH = PROJECTPATH / 'data' / 'reference_data' / 'AffiliationMappingFile.csv'

# Reference mapping: standard name -> known spellings
affil_mapping = pd.read_csv(REFAFFILPATH)
mapping_dct = make_mapping_dict(affil_mapping)

for filepath in FILEPATHS:
    filename = filepath.stem

    # output locations, derived from the name of the input file
    filename_stand = filename.replace('_abstract', '_standardized.csv')
    filename_to_stand = filename.replace('_abstract', '_to_standardize.csv')
    filepath_stand = Path(FOLDERPATH, filename_stand)
    filepath_to_stand = Path(FOLDERPATH, filename_to_stand)

    # read the '*_abstract' input file
    df = pd.read_csv(filepath, delimiter=',')

    # map raw institute names onto their standard names
    print(f"standardizing {filename}...")
    df_stand, df_tostand = standardize_affiliation_names(df, mapping_dct)
    print("done!")

    # full result goes to one file, the names that still need
    # standardizing to a separate file for manual check
    df_stand.to_csv(filepath_stand, index=False)
    df_tostand.to_csv(filepath_to_stand, index=False)

0 comments on commit 1ffa54d

Please sign in to comment.