Skip to content

Commit

Permalink
Adds update script checking compound initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
IvanChernyshov committed Sep 22, 2024
1 parent 503ac5c commit 4fafe14
Show file tree
Hide file tree
Showing 2 changed files with 143 additions and 1 deletion.
10 changes: 9 additions & 1 deletion nistchempy/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,15 @@
#%% Search

def get_found_compounds(soup: _bs4.BeautifulSoup) -> dict:
''' '''
'''Extracts IDs of found compounds for NIST Chemistry WebBook search
Arguments:
soup (_bs4.BeautifulSoup): bs4-parsed web-page
Returns:
dict: extracted NIST search parameters
'''
try:
refs = soup.find('ol').findChildren('a', href = _re.compile('/cgi/cbook.cgi'))
IDs = [_uparse.parse_qs(_uparse.urlparse(a.attrs['href']).query)['ID'][0] \
Expand Down
134 changes: 134 additions & 0 deletions update/check_compound_initialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
'''Check compound initialization using preloaded html-files'''

#%% Imports

import os, argparse

from tqdm import tqdm

import pandas as pd

from bs4 import BeautifulSoup

import nistchempy as nist

from typing import List


#%% Functions

def check_soup(soup: BeautifulSoup) -> bool:
'''Returns False if something is wrong with the compound's soup
Arguments:
soup (BeautifulSoup): bs4-parsed web-page
Returns:
bool: True if compound page is OK
'''
if not nist.parsing.is_compound_page(soup):
return False
# extract data
info = {**nist.parsing.parse_compound_page(soup),
'nist_response': None}
X = nist.compound.NistCompound(**info)

return X is not None


def get_unreadable_compounds(dir_html: str) -> List[str]:
'''Iterates through loaded HTML-files and returns names of non-readable ones
Arguments:
dir_html: path to the directory containing compound HTML-files
Returns:
List[str]: list of non-readable HTML files corresponding to compounds.csv row indexes
'''
errors = []
# cycle over files
fs = os.listdir(dir_html)
for f in tqdm(fs, total = len(fs)):
# prepare
idx = int(f.replace('.html', ''))
path = os.path.join(dir_html, f)
with open(path, 'r') as inpf:
text = inpf.read()
soup = BeautifulSoup(text, 'html.parser')
# check
flag = check_soup(soup)
if not flag:
errors.append(idx)

return errors



#%% Main functions

def get_arguments() -> argparse.Namespace:
'''CLI wrapper
Returns:
argparse.Namespace: CLI arguments
'''
parser = argparse.ArgumentParser(description = 'Runs compound initialization from pre-loaded HTML-pages')
parser.add_argument('dir_data',
help = 'directory containing compounds.csv and htmls/')
args = parser.parse_args()

return args


def check_arguments(args: argparse.Namespace) -> None:
'''Tries to create dir_data if it does not exist and raizes error if dir_data is a file
Arguments:
args (argparse.Namespace): input parameters
'''
# check root dir
if not os.path.exists(args.dir_data):
raise ValueError(f'Given dir_data argument does not exist: {args.dir_data}')
if not os.path.isdir(args.dir_data):
raise ValueError(f'Given dir_data argument is not a directory: {args.dir_data}')
# check compounds.csv
path_csv = os.path.join(args.dir_data, 'compounds.csv')
if not os.path.exists(path_csv):
raise ValueError('Given dir_data directory does not contain compounds.csv file')

return


def main() -> None:
'''Runs compound initialization'''
print('Preparing data ...')
# prepare arguments
args = get_arguments()
check_arguments(args)
dir_html = os.path.join(args.dir_data, 'htmls/')
path_csv = os.path.join(args.dir_data, 'compounds.csv')
df = pd.read_csv(path_csv)
# process compounds
print('Running compound initialization ...')
idxs = get_unreadable_compounds(dir_html)
sub = df.loc[df.index.isin(idxs)]
# save
print('\nSaving data ...')
path_out = os.path.join(args.dir_data, 'unreadable.csv')
sub.to_csv(path_out)

return



#%% Main

if __name__ == '__main__':

main()


0 comments on commit 4fafe14

Please sign in to comment.