def get_search_parameters() -> _tp.Dict[str, str]:
    '''Builds the mapping between search option keys and their descriptions

    A new dictionary is created on every call, so callers may modify the
    returned object freely. Insertion order follows the listing below.

    Returns:
        _tp.Dict[str, str]: {short_key => search_parameter}

    '''
    # general search options
    options = (
        ('use_SI', 'Units for thermodynamic data, "SI" if True and "calories" if False'),
        ('match_isotopes', 'Exactly match the specified isotopes (formula search only)'),
        ('allow_other', 'Allow elements not specified in formula (formula search only)'),
        ('allow_extra', 'Allow more atoms of elements in formula than specified (formula search only)'),
        ('no_ion', 'Exclude ions from the search (formula search only)'),
    )
    # three-letter flags, one per kind of data
    data_flags = (
        ('cTG', 'Gas phase thermochemistry data'),
        ('cTC', 'Condensed phase thermochemistry data'),
        ('cTP', 'Phase change data'),
        ('cTR', 'Reaction thermochemistry data'),
        ('cIE', 'Gas phase ion energetics data'),
        ('cIC', 'Ion clustering data'),
        ('cIR', 'IR Spectrum'),
        ('cTZ', 'THz IR spectrum'),
        ('cMS', 'Mass spectrum (electron ionization)'),
        ('cUV', 'UV/Visible spectrum'),
        ('cGC', 'Gas Chromatography'),
        ('cES', 'Vibrational and/or electronic energy levels'),
        ('cDI', 'Constants of diatomic molecules'),
        ('cSO', 'Henry\'s Law data'),
    )

    return dict(options + data_flags)
'''Extracts compound info from previously downloaded HTML-files'''

#%% Imports

import os
import argparse
import json

from bs4 import BeautifulSoup

import pandas as pd

from tqdm import tqdm

import nistchempy as nist



#%% Functions

def get_compounds_info(dir_data: str) -> None:
    '''Extracts compound info from HTML-files

    Parses every file found in the htmls/ and htmls_stereo/ subdirectories,
    skips pages that nistchempy does not recognize as compound pages, and
    dumps the parsed records to compounds_data.json inside dir_data

    Arguments:
        dir_data (str): root data dump directory

    '''

    # get list of htmls; listing is sorted so that the dump order is reproducible
    paths = []
    for subdir in ('htmls', 'htmls_stereo'):
        dir_html = os.path.join(dir_data, subdir)
        paths += [os.path.join(dir_html, name)
                  for name in sorted(os.listdir(dir_html))]

    # run extraction
    data = []
    for path in tqdm(paths):
        # NOTE(review): relies on the platform default encoding; confirm how
        # the pages were saved by the download script
        with open(path, 'r') as inpf:
            soup = BeautifulSoup(inpf.read(), 'html.parser')
        if not nist.parsing.is_compound_page(soup):
            continue
        data.append(nist.parsing.parse_compound_page(soup))

    # save data
    path_out = os.path.join(dir_data, 'compounds_data.json')
    with open(path_out, 'w') as outf:
        json.dump(data, outf, indent = 2)



def get_columns(data: list) -> list:
    '''Extracts columns from compound data

    Plain (non-reference) keys of the first record come first, followed by
    the keys collected from every "*_refs" dictionary over all records.
    For "data_refs" the three-letter keys are replaced by the descriptions
    of all three-letter search parameters, and the remaining keys are kept

    Arguments:
        data (list): contents of compounds_data.json

    Returns:
        list: column names

    Raises:
        ValueError: if data is empty

    '''
    if not data:
        raise ValueError('Compound data is empty, cannot infer columns')

    cols = [k for k in data[0].keys() if '_refs' not in k]
    # get unique ref keys
    keys = {k: set() for k in data[0].keys() if '_refs' in k}
    for item in data:
        for k1 in keys:
            keys[k1].update(item[k1].keys())
    keys = {k: sorted(v) for k, v in keys.items()}
    # fix data_refs: three-letter keys are renamed to descriptions later on,
    # so the descriptions are used as the column names here
    ps = nist.search.get_search_parameters()
    data_refs = [v for k, v in ps.items() if len(k) == 3]
    data_refs += [k for k in keys['data_refs'] if len(k) != 3]
    keys['data_refs'] = data_refs
    # final columns
    for ref_cols in keys.values():
        cols += ref_cols

    return cols



def prepare_dataset(dir_data: str) -> None:
    '''Transforms extracted data to nist_data.csv

    Loads compounds_data.json from dir_data, flattens each record into a
    single row, renames three-letter data keys to their descriptions, sorts
    rows by ID and writes the table to nist_data.csv inside dir_data

    Arguments:
        dir_data (str): root data dump directory

    Raises:
        ValueError: if compounds_data.json contains no records

    '''

    # load data
    path_json = os.path.join(dir_data, 'compounds_data.json')
    with open(path_json, 'r') as inpf:
        data = json.load(inpf)
    if not data:
        raise ValueError(f'No compound data found in {path_json}')

    # prepare
    ref_keys = [k for k in data[0].keys() if '_refs' in k]
    ps = nist.search.get_search_parameters()
    ps = {k: v for k, v in ps.items() if len(k) == 3}
    cols = get_columns(data)
    rows = []

    # get rows
    for item in data:
        add = {k: v for k, v in item.items() if '_refs' not in k}
        # synonyms are joined with a literal backslash-n so that each one
        # stays within a single CSV cell line
        add['synonyms'] = '\\n'.join(add['synonyms'])
        for k in ref_keys:
            add.update(item[k])
        rows.append(add)

    # process dataframe
    df = pd.DataFrame(rows)
    df = df.rename(columns = ps)
    df = df.sort_values('ID', ignore_index = True)
    df = df[cols]

    # save
    path_out = os.path.join(dir_data, 'nist_data.csv')
    df.to_csv(path_out, index = False)



#%% Main functions

def get_arguments() -> argparse.Namespace:
    '''CLI wrapper

    Returns:
        argparse.Namespace: CLI arguments

    '''
    parser = argparse.ArgumentParser(description = 'Extracts compound info from downloaded HTML-pages of NIST Chemistry WebBook compounds')
    parser.add_argument('dir_data',
                        help = 'directory containing compound.csv file created by get_nist_compounds.py script')
    args = parser.parse_args()

    return args


def check_arguments(args: argparse.Namespace) -> None:
    '''Checks the data directory layout and creates htmls_stereo/ if it is absent

    Arguments:
        args (argparse.Namespace): input parameters

    Raises:
        ValueError: if dir_data does not exist, is not a directory,
            or does not contain the htmls/ folder

    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check htmls dir
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.exists(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')
    # check stereo dir; it is created so that the extraction step can list it
    dir_stereo = os.path.join(args.dir_data, 'htmls_stereo')
    if not os.path.exists(dir_stereo):
        os.mkdir(dir_stereo)


def main() -> None:
    '''Extracts compound info from downloaded HTML pages and saves it as a table'''

    # prepare arguments
    args = get_arguments()
    check_arguments(args)

    # extract info; an existing compounds_data.json is reused as is
    print('\nExtracting info from HTML-files ...')
    path_json = os.path.join(args.dir_data, 'compounds_data.json')
    if not os.path.exists(path_json):
        get_compounds_info(args.dir_data)

    # transform to dataframes
    print('\nTransforming to dataframe ...')
    prepare_dataset(args.dir_data)
    print()



#%% Main

if __name__ == '__main__':

    main()