Skip to content

Commit

Permalink
Fixes name clash for search function and module; adds final update sc…
Browse files Browse the repository at this point in the history
…ript
  • Loading branch information
IvanChernyshov committed Sep 24, 2024
1 parent f57d563 commit 2a4c46f
Show file tree
Hide file tree
Showing 3 changed files with 236 additions and 28 deletions.
2 changes: 1 addition & 1 deletion nistchempy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from nistchempy.compound_list import get_all_data
from nistchempy.compound import get_compound
from nistchempy.search import search, NistSearchParameters
from nistchempy.search import run_search, NistSearchParameters
from nistchempy.search import print_search_parameters


66 changes: 39 additions & 27 deletions nistchempy/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,29 +12,41 @@

#%% Search parameters helper

def print_search_parameters() -> None:
'''
Prints available search parameters
def get_search_parameters() -> _tp.Dict[str, str]:
'''Returns search parameters and the corresponding keys
Returns:
_tp.Dict[str, str]: {short_key => search_parameter}
'''
info = {'use_SI': 'Units for thermodynamic data, "SI" if True and "calories" if False',
'match_isotopes': 'Exactly match the specified isotopes (formula search only)',
'allow_other': 'Allow elements not specified in formula (formula search only)',
'allow_extra': 'Allow more atoms of elements in formula than specified (formula search only)',
'no_ion': 'Exclude ions from the search (formula search only)',
'cTG': 'Contains gas-phase thermodynamic data',
'cTC': 'Contains condensed-phase thermodynamic data',
'cTP': 'Contains phase-change thermodynamic data',
'cTR': 'Contains reaction thermodynamic data',
'cIE': 'Contains ion energetics thermodynamic data',
'cIC': 'Contains ion cluster thermodynamic data',
'cIR': 'Contains IR data',
'cTZ': 'Contains THz IR data',
'cMS': 'Contains MS data',
'cUV': 'Contains UV/Vis data',
'cGC': 'Contains gas chromatography data',
'cES': 'Contains vibrational and electronic energy levels',
'cDI': 'Contains constants of diatomic molecules',
'cSO': 'Contains info on Henry\'s law'}
'cTG': 'Gas phase thermochemistry data',
'cTC': 'Condensed phase thermochemistry data',
'cTP': 'Phase change data',
'cTR': 'Reaction thermochemistry data',
'cIE': 'Gas phase ion energetics data',
'cIC': 'Ion clustering data',
'cIR': 'IR Spectrum',
'cTZ': 'THz IR spectrum',
'cMS': 'Mass spectrum (electron ionization)',
'cUV': 'UV/Visible spectrum',
'cGC': 'Gas Chromatography',
'cES': 'Vibrational and/or electronic energy levels',
'cDI': 'Constants of diatomic molecules',
'cSO': 'Henry\'s Law data'}

return info


def print_search_parameters() -> None:
'''
Prints available search parameters
'''
info = get_search_parameters()
max_len = max([len(_) for _ in info])
spaces = [' '*(max_len - len(_) + 1) for _ in info]
for (key, val), space in zip(info.items(), spaces):
Expand Down Expand Up @@ -170,16 +182,16 @@ def load_found_compounds(self, **kwargs) -> None:

#%% Search

def search(identifier: str, search_type: str,
search_parameters: _tp.Optional[NistSearchParameters] = None,
use_SI: bool = True, match_isotopes: bool = False,
allow_other: bool = False, allow_extra: bool = False,
no_ion: bool = False, cTG: bool = False, cTC: bool = False,
cTP: bool = False, cTR: bool = False, cIE: bool = False,
cIC: bool = False, cIR: bool = False, cTZ: bool = False,
cMS: bool = False, cUV: bool = False, cGC: bool = False,
cES: bool = False, cDI: bool = False, cSO: bool = False,
**kwargs) -> NistSearch:
def run_search(identifier: str, search_type: str,
search_parameters: _tp.Optional[NistSearchParameters] = None,
use_SI: bool = True, match_isotopes: bool = False,
allow_other: bool = False, allow_extra: bool = False,
no_ion: bool = False, cTG: bool = False, cTC: bool = False,
cTP: bool = False, cTR: bool = False, cIE: bool = False,
cIC: bool = False, cIR: bool = False, cTZ: bool = False,
cMS: bool = False, cUV: bool = False, cGC: bool = False,
cES: bool = False, cDI: bool = False, cSO: bool = False,
**kwargs) -> NistSearch:
'''Searches compounds in NIST Chemistry WebBook
Arguments:
Expand Down
196 changes: 196 additions & 0 deletions update/extract_info_from_htmls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
'''Extracts compound info from previously downloaded HTML-files'''

#%% Imports

import os, argparse, json

from bs4 import BeautifulSoup

import pandas as pd

from tqdm import tqdm

import nistchempy as nist



#%% Functions

def get_compounds_info(dir_data: str) -> None:
    '''Extracts compound info from HTML-files and dumps it to a JSON file

    Every entry of the htmls/ and htmls_stereo/ subdirectories of dir_data is
    parsed; pages that are not recognized as compound pages are skipped.
    The collected info is written to <dir_data>/compounds_data.json.

    Arguments:
        dir_data (str): root data dump directory
    '''

    # get list of htmls; os.listdir returns entries in arbitrary order,
    # so they are sorted to make the output reproducible between runs
    fs = []
    for d in ('htmls', 'htmls_stereo'):
        dir_html = os.path.join(dir_data, d)
        fs.extend(os.path.join(dir_html, f) for f in sorted(os.listdir(dir_html)))

    # run extraction
    data = []
    for f in tqdm(fs):
        # NOTE(review): pages are decoded with the platform default encoding;
        # confirm it matches the encoding used when the pages were saved
        with open(f, 'r') as inpf:
            soup = BeautifulSoup(inpf.read(), 'html.parser')
        if not nist.parsing.is_compound_page(soup):
            continue
        data.append(nist.parsing.parse_compound_page(soup))

    # save data
    path_out = os.path.join(dir_data, 'compounds_data.json')
    with open(path_out, 'w') as outf:
        json.dump(data, outf, indent = 2)



def get_columns(data: list) -> list:
    '''Builds the ordered list of column names for the compound table

    Keys of the first compound that do not contain "_refs" come first,
    followed by the keys found inside each "*_refs" sub-dictionary.

    Arguments:
        data (list): contents of compounds_data.json

    Returns:
        list: column names
    '''
    first = data[0]
    cols = [k for k in first if '_refs' not in k]
    # get unique ref keys over all compounds
    keys = {k: set() for k in first if '_refs' in k}
    for item in data:
        for k, found in keys.items():
            found.update(item[k])
    keys = {k: sorted(v) for k, v in keys.items()}
    # fix data_refs: the descriptions of all three-character search parameters
    # go first (in the order of get_search_parameters, whether or not they
    # occur in data), then the remaining keys found in data in sorted order
    ps = nist.search.get_search_parameters()
    data_refs = [v for k, v in ps.items() if len(k) == 3]
    data_refs += [k for k in keys['data_refs'] if len(k) != 3]
    keys['data_refs'] = data_refs
    # final columns
    for v in keys.values():
        cols += v

    return cols



def prepare_dataset(dir_data: str) -> None:
    '''Transforms extracted data to nist_data.csv

    Reads <dir_data>/compounds_data.json, flattens each compound into one
    table row and writes the table to <dir_data>/nist_data.csv.

    Arguments:
        dir_data (str): root data dump directory
    '''

    # load data
    path_json = os.path.join(dir_data, 'compounds_data.json')
    with open(path_json, 'r') as inpf:
        data = json.load(inpf)

    # prepare
    ref_keys = [k for k in data[0] if '_refs' in k]
    ps = nist.search.get_search_parameters()
    # only three-character keys are used to rename columns
    ps = {k: v for k, v in ps.items() if len(k) == 3}
    cols = get_columns(data)

    # get rows
    rows = []
    for item in data:
        add = {k: v for k, v in item.items() if '_refs' not in k}
        # the separator is a literal backslash followed by "n", not a newline
        # (presumably to keep each record on a single CSV line - confirm)
        add['synonyms'] = '\\n'.join(add['synonyms'])
        # flatten all "*_refs" sub-dictionaries into the row
        for k in ref_keys:
            add.update(item[k])
        rows.append(add)

    # process dataframe
    df = pd.DataFrame(rows)
    df = df.rename(columns = ps)
    df = df.sort_values('ID', ignore_index = True)
    # get_columns always lists every search-parameter column, even when no
    # compound in the dump has it; reindex adds such columns as empty ones,
    # whereas df[cols] would raise KeyError
    df = df.reindex(columns = cols)

    # save
    path_out = os.path.join(dir_data, 'nist_data.csv')
    df.to_csv(path_out, index = False)



#%% Main functions

def get_arguments() -> argparse.Namespace:
    '''CLI wrapper

    Parses the command line, which consists of the single positional
    argument dir_data.

    Returns:
        argparse.Namespace: CLI arguments
    '''
    # the description states what this script does (extraction from already
    # downloaded pages), not what the download script does
    parser = argparse.ArgumentParser(
        description = 'Extracts compound info from previously downloaded HTML-pages of NIST Chemistry WebBook compounds'
    )
    parser.add_argument(
        'dir_data',
        help = 'directory containing compound.csv file created by get_nist_compounds.py script'
    )

    return parser.parse_args()


def check_arguments(args: argparse.Namespace) -> None:
    '''Validates dir_data and creates the htmls_stereo/ folder if it is missing

    Arguments:
        args (argparse.Namespace): input parameters

    Raises:
        ValueError: if dir_data does not exist, is not a directory,
            or does not contain the htmls/ folder
    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check htmls dir; isdir also rejects a regular file named "htmls",
    # which would otherwise fail later when the directory is listed
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.isdir(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')
    # check stereo dir; it is created empty when missing, because
    # get_compounds_info lists it unconditionally
    dir_stereo = os.path.join(args.dir_data, 'htmls_stereo')
    if not os.path.exists(dir_stereo):
        os.mkdir(dir_stereo)


def main() -> None:
    '''Script entry point: extracts compound info and builds the CSV table

    The extraction step is skipped when compounds_data.json is already
    present in the data directory; the table is rebuilt on every run.
    '''
    # read and validate CLI input
    cli = get_arguments()
    check_arguments(cli)
    root = cli.dir_data

    # HTML -> JSON, reusing an existing dump if there is one
    print('\nExtracting info from HTML-files ...')
    cached_json = os.path.join(root, 'compounds_data.json')
    already_extracted = os.path.exists(cached_json)
    if not already_extracted:
        get_compounds_info(root)

    # JSON -> CSV
    print('\nTransforming to dataframe ...')
    prepare_dataset(root)
    print()



#%% Main

# run the CLI workflow only when the file is executed as a script
if __name__ == '__main__':

    main()



0 comments on commit 2a4c46f

Please sign in to comment.