Commit 2a4c46f (parent f57d563): Fixes name clash for search function and module; adds final update script

Showing 3 changed files with 236 additions and 28 deletions.
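The clash itself is not visible in this diff, but the script below calls nist.search.get_search_parameters(), which only resolves if search is the submodule rather than a same-named function bound over it. A minimal sketch of that kind of shadowing (all names here are hypothetical stand-ins, not nistchempy's actual layout):

import types

# build a stand-in package with a `search` submodule
pkg = types.ModuleType('pkg')
pkg.search = types.ModuleType('pkg.search')
pkg.search.get_search_parameters = lambda: {'cTG': 'gas phase thermochemistry'}

def search(query):
    # a helper that happens to share the submodule's name
    return f'searching for {query}'

pkg.search = search  # rebinding the name clobbers the submodule ...
# pkg.search.get_search_parameters()  # ... so this now raises AttributeError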
@@ -0,0 +1,196 @@
'''Extracts compound info from previously downloaded HTML-files'''

#%% Imports

import os, argparse, json

from bs4 import BeautifulSoup

import pandas as pd

from tqdm import tqdm

import nistchempy as nist


#%% Functions
def get_compounds_info(dir_data: str) -> None:
    '''Extracts compound info from HTML-files
    Arguments:
        dir_data (str): root data dump directory
    '''
    # get list of htmls
    fs = []
    for d in ('htmls', 'htmls_stereo'):
        for f in os.listdir(os.path.join(dir_data, d)):
            path = os.path.join(dir_data, d, f)
            fs.append(path)
    # run extraction
    data = []
    for f in tqdm(fs):
        with open(f, 'r') as inpf:
            soup = BeautifulSoup(inpf.read(), 'html.parser')
        if not nist.parsing.is_compound_page(soup):
            continue
        info = nist.parsing.parse_compound_page(soup)
        data.append(info)
    # save data
    path_out = os.path.join(dir_data, 'compounds_data.json')
    with open(path_out, 'w') as outf:
        json.dump(data, outf, indent = 2)

    return
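For a single file, the extraction loop above reduces to the following; the parsing calls are the ones used in the script, while the file name is a placeholder:

from bs4 import BeautifulSoup
import nistchempy as nist

with open('htmls/C71432.html') as inpf:  # placeholder file name
    soup = BeautifulSoup(inpf.read(), 'html.parser')
if nist.parsing.is_compound_page(soup):
    info = nist.parsing.parse_compound_page(soup)  # dict of compound fields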
def get_columns(data: list) -> list:
    '''Extracts columns from compound data
    Arguments:
        data (list): contents of compounds_data.json
    Returns:
        list: column names
    '''
    cols = [k for k in data[0].keys() if '_refs' not in k]
    # get unique ref keys
    keys = {k: set() for k in data[0].keys() if '_refs' in k}
    for item in data:
        for k1 in keys:
            for k2 in item[k1].keys():
                keys[k1].add(k2)
    keys = {k: sorted(list(v)) for k, v in keys.items()}
    # fix data_refs
    ps = nist.search.get_search_parameters()
    data_refs = [v for k, v in ps.items() if len(k) == 3]
    data_refs += [k for k in keys['data_refs'] if len(k) != 3]
    keys['data_refs'] = data_refs
    # final columns
    for k, v in keys.items():
        cols += v

    return cols
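The union-of-keys step in get_columns can be seen in isolation with toy records (the ref keys below are made up; real ones come from the parsed pages):

# two hypothetical records whose data_refs dicts have different keys
data = [
    {'ID': 'C100', 'data_refs': {'cTG': 'u1', 'mass-spec': 'u2'}},
    {'ID': 'C200', 'data_refs': {'cIR': 'u3'}},
]
keys = {k: set() for k in data[0] if '_refs' in k}
for item in data:
    for k1 in keys:
        keys[k1].update(item[k1])  # collect every key seen in any record
print({k: sorted(v) for k, v in keys.items()})
# {'data_refs': ['cIR', 'cTG', 'mass-spec']}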
def prepare_dataset(dir_data: str) -> None:
    '''Transforms extracted data to nist_data.csv
    Arguments:
        dir_data (str): root data dump directory
    '''
    # load data
    path_json = os.path.join(dir_data, 'compounds_data.json')
    with open(path_json, 'r') as inpf:
        data = json.load(inpf)
    # prepare
    ref_keys = [k for k in data[0].keys() if '_refs' in k]
    ps = nist.search.get_search_parameters()
    ps = {k: v for k, v in ps.items() if len(k) == 3}
    cols = get_columns(data)
    df = []
    # get rows
    for item in data:
        add = {k: v for k, v in item.items() if '_refs' not in k}
        add['synonyms'] = '\\n'.join(add['synonyms'])
        for k in ref_keys:
            add.update(item[k])
        df.append(add)
    # process dataframe
    df = pd.DataFrame(df)
    df = df.rename(columns = ps)
    df = df.sort_values('ID', ignore_index = True)
    df = df[cols]
    # save
    path_out = os.path.join(dir_data, 'nist_data.csv')
    df.to_csv(path_out, index = None)

    return
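Note that synonyms are joined with the two-character literal sequence \n rather than a real newline, so each compound stays on a single CSV row. Reading them back is then an explicit split; a sketch assuming pandas >= 1.4 for the regex flag:

import pandas as pd

df = pd.read_csv('nist_data.csv')
# split on the literal backslash-n separator, not on newline characters
synonyms = df['synonyms'].str.split('\\n', regex=False)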
#%% Main functions

def get_arguments() -> argparse.Namespace:
    '''CLI wrapper
    Returns:
        argparse.Namespace: CLI arguments
    '''
    parser = argparse.ArgumentParser(description = 'Extracts compound info from downloaded HTML-pages of NIST Chemistry WebBook compounds')
    parser.add_argument('dir_data',
                        help = 'directory containing compound.csv file created by get_nist_compounds.py script')
    args = parser.parse_args()

    return args
def check_arguments(args: argparse.Namespace) -> None:
    '''Validates dir_data: raises an error if it does not exist, is not a directory,
    or does not contain the htmls/ folder; creates htmls_stereo/ if missing
    Arguments:
        args (argparse.Namespace): input parameters
    '''
    # check root dir
    if not os.path.exists(args.dir_data):
        raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
    if not os.path.isdir(args.dir_data):
        raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
    # check htmls dir
    dir_html = os.path.join(args.dir_data, 'htmls')
    if not os.path.exists(dir_html):
        raise ValueError('Given dir_data directory does not contain htmls/ folder')
    # check stereo dir
    dir_stereo = os.path.join(args.dir_data, 'htmls_stereo')
    if not os.path.exists(dir_stereo):
        os.mkdir(dir_stereo)

    return
def main() -> None:
    '''Updates the list of NIST compounds via downloaded HTML pages'''
    # prepare arguments
    args = get_arguments()
    check_arguments(args)
    # extract info
    print('\nExtracting info from HTML-files ...')
    path_json = os.path.join(args.dir_data, 'compounds_data.json')
    if not os.path.exists(path_json):
        get_compounds_info(args.dir_data)
    # transform to dataframes
    print('\nTransforming to dataframe ...')
    prepare_dataset(args.dir_data)
    print()

    return


#%% Main

if __name__ == '__main__':
    main()
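Assuming the script is saved as, say, update_compounds.py (the actual file name is not shown on this page), a typical run looks like:

python update_compounds.py data/
# expects data/htmls/ to exist; creates data/htmls_stereo/ if missing,
# then writes data/compounds_data.json and data/nist_data.csv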