-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update code to standardize affiliations
- Loading branch information
Showing
8 changed files
with
3,616 additions
and
3,565 deletions.
There are no files selected for viewing
1,028 changes: 514 additions & 514 deletions
1,028
data/1-marinespecies.org_users_2023-11-07_abstract.csv
Large diffs are not rendered by default.
Oops, something went wrong.
408 changes: 204 additions & 204 deletions
408
data/2-mailinglist_lifewatch.be_users_2023-11-07_abstract.csv
Large diffs are not rendered by default.
Oops, something went wrong.
340 changes: 170 additions & 170 deletions
340
data/2-mailinglist_lifewatch.be_users_2023-11-07_standardized_infoadded.csv
Large diffs are not rendered by default.
Oops, something went wrong.
4,800 changes: 2,400 additions & 2,400 deletions
4,800
data/3-users_lifewatch_by_group_kpi_2023-11-07_abstract.csv
Large diffs are not rendered by default.
Oops, something went wrong.
446 changes: 223 additions & 223 deletions
446
data/3-users_lifewatch_by_group_kpi_2023-11-07_standardized_infoadded.csv
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,25 @@ | ||
## add additional info to standardized institutes/affiliations | ||
import pandas as pd | ||
import os | ||
from pathlib import Path | ||
|
||
# Paths are resolved relative to the current working directory, so this
# script has to be launched from the project root.
PROJECTPATH = Path.cwd()
FOLDERPATH = PROJECTPATH / 'data'
REFINFOPATH = FOLDERPATH / 'reference_data' / 'AffiliationInfo.csv'
# Only pick up CSV files; without the suffix check any directory or other
# file whose stem ends in '_standardized' would be handed to read_csv.
FILEPATHS = sorted(
    x for x in FOLDERPATH.iterdir()
    if x.suffix == '.csv' and x.stem.endswith('_standardized')
)

# Load reference file
affil_info = pd.read_csv(REFINFOPATH)
# astype() returns a new Series, so the result has to be assigned back for
# the merge key to actually become a string column.
affil_info['stand_institute'] = affil_info['stand_institute'].astype(str)

# Process every '*_standardized.csv' input file
for filepath in FILEPATHS:
    filename = filepath.stem
    new_filename = f"{filename}_infoadded.csv"

    df = pd.read_csv(filepath, delimiter=',')

    # Fill missing institute names with the placeholder 'NA'. A column that
    # is entirely empty is read as numeric NaN, and pandas refuses to merge a
    # numeric key column with the string keys of the reference table.
    df['stand_institute'] = df['stand_institute'].fillna('NA')
    merged_df = pd.merge(df, affil_info, on='stand_institute', how='left', suffixes=('_', '_info'))

    # Write to file. The destination must be one path: the second positional
    # argument of to_csv is the field separator, not a file name.
    merged_df.to_csv(FOLDERPATH / new_filename, index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,66 @@ | ||
## Standardize the institute names | ||
import pandas as pd | ||
import os | ||
from pathlib import Path | ||
from collections import defaultdict | ||
|
||
CURRENTPATH = os.path.dirname(os.path.realpath(__file__)) | ||
PROJECTPATH = os.path.abspath(os.path.join(CURRENTPATH, '..', '..', '..')) | ||
#Functions | ||
def make_mapping_dict(affil_mapping: pd.DataFrame) -> dict:
    """
    Turn the reference mapping file into a dictionary with structure:
    {stand-affiliation-name : [all possible ways of writing affil name] }

    Parameters
    ----------
    affil_mapping : DataFrame with the columns 'Institute' (a raw spelling)
        and 'Institute_standardized' (the standard name it maps to).

    Returns
    -------
    dict mapping each standardized name to the list of raw spellings, in the
    order the rows appear in affil_mapping.
    """
    mapping_dct = defaultdict(list)
    # A plain loop is used because append() returns None; collecting its
    # results in a list comprehension would yield a list of None values.
    for stand_name, raw_name in zip(affil_mapping['Institute_standardized'],
                                    affil_mapping['Institute']):
        mapping_dct[stand_name].append(raw_name)
    # Hand back a regular dict so that looking up an unknown name raises
    # KeyError instead of silently inserting an empty list.
    return dict(mapping_dct)
|
||
#Load '*_abstract' input files | ||
FOLDERPATH = os.path.join(PROJECTPATH, 'data') | ||
files = [item for item in os.listdir(FOLDERPATH) if item.endswith('_abstract.csv')] | ||
|
||
for file in files: | ||
filename = os.path.splitext(os.path.basename(file))[0] | ||
filepath = os.path.join(FOLDERPATH, file) | ||
df = pd.read_csv(filepath, delimiter=',') | ||
def standardize_affiliation_names(df: pd.DataFrame, mapping_dct: dict) -> tuple:
    """
    Add standardized institute names to df using a mapping dict.

    A row matches a standard name when its 'raw_institute' value, or the
    lower-cased string form of that value, occurs in the list of spellings
    of that standard name. When several standard names match, the one that
    comes last in mapping_dct wins.

    Parameters
    ----------
    df : DataFrame with a 'raw_institute' column. It is modified in place:
        the 'stand_institute' column is created if missing and filled for
        every matching row; existing values of non-matching rows are kept.
    mapping_dct : dict {standard name: [all ways of writing that name]}.

    Returns
    -------
    (df, df_tostand)
        df with standard institute names added in 'stand_institute'
        df_tostand, the de-duplicated rows of df whose institute name
        could not be standardized yet
    """
    # Invert the mapping once so each row needs two dict lookups instead of
    # a scan over every list. The position is stored so that the last
    # matching standard name can be picked, as a sequential scan would.
    lookup = {}
    for position, (stand_inst, inst_list) in enumerate(mapping_dct.items()):
        for inst in inst_list:
            lookup[inst] = (position, stand_inst)

    # Make sure the column exists even when no row matches; otherwise the
    # null filter below would raise a KeyError.
    if 'stand_institute' not in df.columns:
        df['stand_institute'] = None

    for index, raw in df['raw_institute'].items():
        hits = [lookup[key] for key in (raw, str(raw).lower()) if key in lookup]
        if hits:
            df.at[index, 'stand_institute'] = max(hits, key=lambda hit: hit[0])[1]

    # Non-standardized institutes are returned separately for a manual check
    df_tostand = df.loc[df['stand_institute'].isnull()].drop_duplicates()

    return df, df_tostand
|
||
|
||
# CODE
# Paths are resolved relative to the current working directory, so this
# script has to be launched from the project root.
PROJECTPATH = Path.cwd()
FOLDERPATH = PROJECTPATH / 'data'
# Only pick up CSV files; without the suffix check any directory or other
# file whose stem ends in '_abstract' would be handed to read_csv.
FILEPATHS = sorted(
    x for x in FOLDERPATH.iterdir()
    if x.suffix == '.csv' and x.stem.endswith('_abstract')
)
REFAFFILPATH = PROJECTPATH / 'data' / 'reference_data' / 'AffiliationMappingFile.csv'

affil_mapping = pd.read_csv(REFAFFILPATH)
mapping_dct = make_mapping_dict(affil_mapping)

for filepath in FILEPATHS:
    filename = filepath.stem
    # Strip only the trailing '_abstract'; str.replace would also rewrite an
    # earlier occurrence of that text inside the file name.
    basename = filename[:-len('_abstract')]
    filename_stand = f"{basename}_standardized.csv"
    filepath_stand = FOLDERPATH / filename_stand
    filename_to_stand = f"{basename}_to_standardize.csv"
    filepath_to_stand = FOLDERPATH / filename_to_stand

    # read input
    df = pd.read_csv(filepath, delimiter=',')

    # standardize affiliation names
    print(f"standardizing {filename}...")
    df_stand, df_tostand = standardize_affiliation_names(df, mapping_dct)
    print("done!")

    # write df to file, and subset of non-stand names to separate file for manual check
    df_stand.to_csv(filepath_stand, index=False)
    df_tostand.to_csv(filepath_to_stand, index=False)