diff --git a/README.md b/README.md index 30f39f6..e4f4ee7 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,19 @@ $ cd spacekit $ pip install -e . ``` +*Testing* + +See `tox.ini` for a list of test suite markers. + +```bash +# run all tests +$ pytest + +# some tests, like the `scan` module rely on the test `env` option +$ pytest --env svm -m scan +$ pytest --env cal -m scan +``` + ### Pre-Trained Neural Nets diff --git a/conftest.py b/conftest.py index c984218..6e1e5f1 100644 --- a/conftest.py +++ b/conftest.py @@ -5,6 +5,7 @@ from spacekit.analyzer.explore import HstCalPlots, HstSvmPlots from spacekit.analyzer.scan import SvmScanner, CalScanner, import_dataset from spacekit.extractor.load import load_datasets + # try: # from pytest_astropy_header.display import (PYTEST_HEADER_MODULES, # TESTED_VERSIONS) @@ -50,7 +51,9 @@ def __init__(self, env): "cal": os.path.join(f"tests/data/{env}/data.zip"), }[env] - self.kwargs = {"svm": dict(index_col="index"), "cal": dict(index_col="ipst")}[env] + self.kwargs = {"svm": dict(index_col="index"), "cal": dict(index_col="ipst")}[ + env + ] self.decoder = { "svm": {"det": {0: "hrc", 1: "ir", 2: "sbc", 3: "uvis", 4: "wfc"}}, @@ -68,17 +71,30 @@ def __init__(self, env): }[env] self.norm_cols = { - "svm": ["numexp", "rms_ra", "rms_dec", "nmatches", "point", "segment", "gaia"], + "svm": [ + "numexp", + "rms_ra", + "rms_dec", + "nmatches", + "point", + "segment", + "gaia", + ], "cal": ["n_files", "total_mb"], }[env] - self.rename_cols = { - "svm": "_scl", - "cal": ["x_files", "x_size"] - }[env] + self.rename_cols = {"svm": "_scl", "cal": ["x_files", "x_size"]}[env] self.enc_cols = { "svm": ["det", "wcs", "cat"], - "cal": ["drizcorr", "pctecorr", "crsplit", "subarray", "detector", "dtype", "instr"] + "cal": [ + "drizcorr", + "pctecorr", + "crsplit", + "subarray", + "detector", + "dtype", + "instr", + ], }[env] self.tx_file = { @@ -112,8 +128,8 @@ def res_data_path(cfg, tmp_path_factory): data_path = os.path.join(basepath, dname) return data_path - -@fixture(scope='session') + +@fixture(scope="session") def df_ncols(cfg): fname = cfg.labeled X_cols = cfg.norm_cols + cfg.enc_cols @@ -135,9 +151,7 @@ def scanner(cfg, res_data_path): @fixture(scope="session") def explorer(cfg, res_data_path): fname = res_data_path - df = import_dataset( - filename=fname, kwargs=cfg.kwargs, decoder=cfg.decoder - ) + df = import_dataset(filename=fname, kwargs=cfg.kwargs, decoder=cfg.decoder) if cfg.env == "svm": hst = HstSvmPlots(df) elif cfg.env == "cal": @@ -244,4 +258,4 @@ def scraped_mast_file(): # CAL @fixture(scope="function") def cal_labeled_dataset(): - return "tests/data/cal/train/training.csv" \ No newline at end of file + return "tests/data/cal/train/training.csv" diff --git a/docs/source/conf.py b/docs/source/conf.py index 46bb5e1..2f3d470 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -12,28 +12,31 @@ # import os import sys -#import sphinx + +# import sphinx import datetime import os from configparser import ConfigParser -#import stsci_rtd_theme + +# import stsci_rtd_theme # -- Project information ----------------------------------------------------- # General information about the project -project = u'spacekit' -author = u'Ru Kein' +project = "spacekit" +author = "Ru Kein" year = datetime.datetime.now().year -copyright = f'{year}, {author}' +copyright = f"{year}, {author}" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # build documents. 
# The full version, including alpha/beta/rc tags. -release = '0.2.7' -#release = get_distribution(project).version +release = "0.2.7" +# release = get_distribution(project).version # The short X.Y version. -version = '.'.join(release.split('.')[:2]) +version = ".".join(release.split(".")[:2]) + def setup(app): app.add_css_file("stsci.css") @@ -41,17 +44,17 @@ def setup(app): # -- General configuration ------------------------------------------------ conf = ConfigParser() -conf.read([os.path.join(os.path.dirname(__file__), '..', 'setup.cfg')]) +conf.read([os.path.join(os.path.dirname(__file__), "..", "setup.cfg")]) # setup_cfg = dict(conf.items('metadata')) # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../')) -sys.path.insert(0, os.path.abspath('../../')) +sys.path.insert(0, os.path.abspath("../")) +sys.path.insert(0, os.path.abspath("../../")) -#on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +# on_rtd = os.environ.get('READTHEDOCS', None) == 'True' # Configuration for intersphinx: refer to the Python standard library. # Uncomment if you cross-ref to API doc from other packages. @@ -66,46 +69,46 @@ def setup(app): # (None, 'http://data.astropy.org/intersphinx/matplotlib.inv')), # noqa # 'astropy': ('https://docs.astropy.org/en/stable/', None)} intersphinx_mapping = { - 'python': ('http://docs.python.org/3/', None), - 'numpy': ('http://docs.scipy.org/doc/numpy/', None), - 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None), - 'matplotlib': ('http://matplotlib.org/', None), - 'astropy': ('http://docs.astropy.org/en/stable/', None), + "python": ("http://docs.python.org/3/", None), + "numpy": ("http://docs.scipy.org/doc/numpy/", None), + "scipy": ("http://docs.scipy.org/doc/scipy/reference/", None), + "matplotlib": ("http://matplotlib.org/", None), + "astropy": ("http://docs.astropy.org/en/stable/", None), } # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.inheritance_diagram', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosummary', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'numpydoc', - 'sphinx_automodapi.automodapi', - 'sphinx_automodapi.automodsumm', - 'sphinx_automodapi.autodoc_enhancements', - 'sphinx_automodapi.smart_resolver', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.inheritance_diagram", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "numpydoc", + "sphinx_automodapi.automodapi", + "sphinx_automodapi.automodsumm", + "sphinx_automodapi.autodoc_enhancements", + "sphinx_automodapi.smart_resolver", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. 
Refer to documentation # for a list of supported languages. @@ -119,11 +122,11 @@ def setup(app): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. -default_role = 'obj' +default_role = "obj" # Don't show summaries of the members in each class along with the # class' docstring @@ -131,7 +134,7 @@ def setup(app): autosummary_generate = True -automodapi_toctreedirnm = 'api' +automodapi_toctreedirnm = "api" # Class documentation should contain *both* the class docstring and # the __init__ docstring @@ -141,12 +144,12 @@ def setup(app): graphviz_output_format = "svg" graphviz_dot_args = [ - '-Nfontsize=10', - '-Nfontname=Helvetica Neue, Helvetica, Arial, sans-serif', - '-Efontsize=10', - '-Efontname=Helvetica Neue, Helvetica, Arial, sans-serif', - '-Gfontsize=10', - '-Gfontname=Helvetica Neue, Helvetica, Arial, sans-serif' + "-Nfontsize=10", + "-Nfontname=Helvetica Neue, Helvetica, Arial, sans-serif", + "-Efontsize=10", + "-Efontname=Helvetica Neue, Helvetica, Arial, sans-serif", + "-Gfontsize=10", + "-Gfontname=Helvetica Neue, Helvetica, Arial, sans-serif", ] # If true, '()' will be appended to :func: etc. cross-reference text. @@ -161,7 +164,7 @@ def setup(app): # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -176,7 +179,7 @@ def setup(app): # a list of builtin themes. # html_theme = 'stsci_rtd_theme' # html_theme_path = [stsci_rtd_theme.get_html_theme_path()] -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". @@ -194,18 +197,18 @@ def setup(app): # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. # html_extra_path = [] -html_static_path = ['_static'] +html_static_path = ["_static"] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -html_last_updated_fmt = '%b %d, %Y' +html_last_updated_fmt = "%b %d, %Y" # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. # html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = {'**': ['globaltoc.html', 'relations.html', 'searchbox.html']} +html_sidebars = {"**": ["globaltoc.html", "relations.html", "searchbox.html"]} # Additional templates that should be rendered to pages, maps page names to # template names. @@ -238,25 +241,24 @@ def setup(app): # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = f'{project}doc' +htmlhelp_basename = f"{project}doc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). - 'papersize': 'letterpaper', + "papersize": "letterpaper", # The font size ('10pt', '11pt' or '12pt'). - 'pointsize': '14pt', + "pointsize": "14pt", # Additional stuff for the LaTeX preamble. - 'preamble': r'''\usepackage{enumitem} \setlistdepth{99}''' + "preamble": r"""\usepackage{enumitem} \setlistdepth{99}""", } # Grouping the document tree into LaTeX files. 
List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - ('index', f'{project}.tex', f'{project} Documentation', - f'{project}', 'manual'), + ("index", f"{project}.tex", f"{project} Documentation", f"{project}", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -270,7 +272,7 @@ def setup(app): # latex_show_pagerefs = False # If true, show URL addresses after external links. -latex_show_urls = 'True' +latex_show_urls = "True" # Documents to append as an appendix to all manuals. # latex_appendices = [] @@ -282,10 +284,7 @@ def setup(app): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', f'{project}', f'{project} Documentation', - [f'{project}'], 1) -] +man_pages = [("index", f"{project}", f"{project} Documentation", [f"{project}"], 1)] # If true, show URL addresses after external links. man_show_urls = True @@ -296,9 +295,15 @@ def setup(app): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', f'{project}', f'{project} Documentation', - f'{author}', f'{project}', f'{project}', - 'Miscellaneous'), + ( + "index", + f"{project}", + f"{project} Documentation", + f"{author}", + f"{project}", + f"{project}", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. @@ -308,7 +313,7 @@ def setup(app): texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -texinfo_show_urls = 'inline' +texinfo_show_urls = "inline" # If true, do not generate a @detailmenu in the "Top" node's menu. # texinfo_no_detailmenu = False @@ -316,10 +321,10 @@ def setup(app): # -- Options for Epub output ---------------------------------------------- # Bibliographic Dublin Core info. -epub_title = f'{project}' -epub_author = f'{author}' -epub_publisher = f'{author}' -epub_copyright = f'{year} {author}' +epub_title = f"{project}" +epub_author = f"{author}" +epub_publisher = f"{author}" +epub_copyright = f"{year} {author}" # The basename for the epub file. It defaults to the project name. # epub_basename = u'wfc3tools' @@ -328,7 +333,7 @@ def setup(app): # optimized for small screen space, using the same theme for HTML and # epub output is usually not wise. This defaults to 'epub', a theme designed # to save visual space. -epub_theme = 'epub' +epub_theme = "epub" # The language of the text. It defaults to the language option # or en if the language is not set. @@ -359,7 +364,7 @@ def setup(app): # epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. # epub_tocdepth = 3 @@ -380,4 +385,4 @@ def setup(app): # epub_show_urls = 'inline' # If false, no index is generated. 
-# epub_use_index = True \ No newline at end of file +# epub_use_index = True diff --git a/pyproject.toml b/pyproject.toml index de38f14..374b58c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,5 +4,3 @@ requires = [ "wheel" ] build-backend = "setuptools.build_meta" -[tool.setuptools] -write_to = "spacekit/_version.py" \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 6aa0c91..a7c9d48 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,6 @@ [metadata] name = spacekit +version = 0.3.0 author = Ru Keïn author_email = rkein@stsci.edu license = MIT diff --git a/setup.py b/setup.py index bb7dcf2..420f5f3 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ """ -if 'test' in sys.argv: +if "test" in sys.argv: print(TEST_HELP) sys.exit(1) @@ -29,10 +29,10 @@ """ -if 'build_docs' in sys.argv or 'build_sphinx' in sys.argv: +if "build_docs" in sys.argv or "build_sphinx" in sys.argv: print(DOCS_HELP) sys.exit(1) # setup(use_scm_version={'write_to': 'spacekit/_version.py'}) -setup(version='0.3.0') +setup(version="0.3.0") diff --git a/spacekit/datasets/hst_cal.py b/spacekit/datasets/hst_cal.py new file mode 100644 index 0000000..1fc7979 --- /dev/null +++ b/spacekit/datasets/hst_cal.py @@ -0,0 +1,59 @@ +from spacekit.extractor.scrape import WebScraper +from spacekit.analyzer.scan import import_dataset + +calcloud_uri = "https://raw.githubusercontent.com/alphasentaurii/spacekit/main/spacekit/datasets/hst/calcloud" + +calcloud_data = { + "2021-11-04": { + "fname": "2021-11-04-1636048291.zip", + "hash": "53bbf7486c4754b60b2f8ff3898ddb3bb6f744c9", + }, + "2021-10-28": { + "fname": "2021-10-28-1635457222.zip", + "hash": "96222742bee071bfa32a403720e6ae8a53e66f56", + }, + "2021-08-22": { + "fname": "2021-08-22-1629663047.zip", + "hash": "ef872adc24ae172d9ccc8a74565ad81104dee2c0", + }, +} + +cal_model_archive = { + "model_path": "calmodels.zip", + "hash": "b36e310874f7cd2c50d4e5c04438118af7751c69", +} + + +def download_single_archive(date_key=None): + if date_key is None: + # default to most recent + date_key = sorted(list(calcloud_data.keys()))[-1] + # limit data download to single archive + dataset = {date_key: calcloud_data[date_key]} + scraper = WebScraper(calcloud_uri, dataset) + scraper.scrape() + fpath = scraper.fpaths[0] + print(fpath) + return fpath + + +def download_archives(): + """Download zip archives of calcloud hst training data iterations (including datasets, models, and results). + + Returns + ------- + list + list of paths to extracted dataset archives + """ + return WebScraper(calcloud_uri, calcloud_data).scrape() + + +def load_data(fpath=None, date_key=None): + if fpath is None: + fpath = download_single_archive(date_key=date_key) + df = import_dataset( + filename=fpath, + kwargs=dict(index_col="ipst"), + decoder_key={"instr": {0: "acs", 1: "cos", 2: "stis", 3: "wfc3"}}, + ) + return df diff --git a/spacekit/extractor/scrape.py b/spacekit/extractor/scrape.py index 0f8e667..e466fed 100644 --- a/spacekit/extractor/scrape.py +++ b/spacekit/extractor/scrape.py @@ -734,16 +734,15 @@ class JsonScraper: specified, the current working directory will be used. search_patterns : list, optional list of glob patterns to use for search - log_level : int, optional - The desired level of verboseness in the log statements displayed on the screen and written to the - .log file. Default value is 'INFO'. 
file_basename : str, optional - Name of the output file basename (filename without the extension) for the Hierarchical Data Format - version 5 (HDF5) .h5 file that the Pandas DataFrame will be written to. If not explicitly specified, - the default filename basename that will be used is "svm_qa_dataframe". The default location that the + Name of the output file basename (filename without the extension) for the Hierarchical Data Format version 5 (HDF5) .h5 file that the Pandas DataFrame will be written to. If not explicitly specified, the default filename basename that will be used is "svm_qa_dataframe". The default location that the output file will be written to is the current working directory crpt: bool, optional - Uses extended dataframe index name to differentiate from normal svm data + Uses extended dataframe index name to differentiate from normal svm data, by default 0 (False) + save_csv: bool, optional + store h5 data into a CSV file, by default False + h5_file: str, optional + load from a saved hdf5 file on local disk, by default None data : Pandas DataFrame Pandas DataFrame """ @@ -756,6 +755,7 @@ def __init__( crpt=0, save_csv=False, store_h5=True, + h5_file=None, output_path=None, ): self.search_path = search_path @@ -764,6 +764,7 @@ def __init__( self.crpt = crpt self.save_csv = save_csv self.store_h5 = store_h5 + self.h5_file = h5_file self.output_path = output_path self.__name__ = "diagnostic_json_harvester" self.msg_datefmt = "%Y%j%H%M%S" diff --git a/spacekit/preprocessor/encode.py b/spacekit/preprocessor/encode.py index 3929f3e..33f1b69 100644 --- a/spacekit/preprocessor/encode.py +++ b/spacekit/preprocessor/encode.py @@ -1,6 +1,7 @@ import pandas as pd from sklearn.preprocessing import LabelEncoder from tensorflow.keras.utils import to_categorical +import numpy as np def encode_target_data(y_train, y_test): @@ -32,30 +33,156 @@ def encode_target_data(y_train, y_test): return y_train, y_test -# class Encoder: -# def __init__(self, df): -# self.df = df +class PairEncoder: + def __init__(self): + self.arr = None + self.transformed = None + self.invpairs = None + self.inversed = None + + def lambda_func(self, inverse=False): + if inverse is False: + L = lambda x: self.keypairs[x] + return [L(a) for a in self.arr] + else: + self.inverse_pairs() + I = lambda i: self.invpairs[i] + return [I(b) for b in self.transformed] + + def inverse_pairs(self): + self.invpairs = {} + for key, value in self.keypairs.items(): + self.invpairs[value] = key + return self.invpairs + + def warn_unknowns(self): + unknowns = np.where([a not in self.classes_ for a in self.arr]) + print(f"WARNING: Found unknown values:\n {self.arr[unknowns]}") + + def handle_unknowns(self): + unknowns = np.where([a not in self.classes_ for a in self.arr]) + add_encoding = max(list(self.keypairs.values())) + 1 + try: + # TODO handle multiple different unknowns + self.keypairs[self.arr[unknowns][0]] = add_encoding + self.classes_ = list(self.keypairs.keys()) + print("Successfully added encoding for unknown values.") + except Exception as e: + print("Error: unable to add encoding for unknown value(s)") + print(e) + + def fit(self, data, keypairs, axiscol=None, handle_unknowns=True): + if isinstance(data, pd.DataFrame): + if axiscol is None: + print( + "Error: Must indicate which column to fit if `data` is a `dataframe`." 
+ ) + return + try: + self.arr = np.asarray(data[axiscol], dtype=object) + except Exception as e: + print(e) + elif isinstance(data, np.ndarray): + if len(data.shape) > 1: + if data.shape[-1] > 1: + if axiscol is None: + print("Error - must specify index using `axiscol`") + return + else: + self.arr = np.asarray(data[:, axiscol], dtype=object) + else: + self.arr = np.asarray(data, dtype=object) + else: + print("Invalid Type: `data` must be of an array or dataframe.") + return + self.keypairs = keypairs + self.classes_ = list(self.keypairs.keys()) + if self.arr.any() not in self.classes_: + # if self.arr.any() not in self.classes_: + self.warn_unknowns() + if handle_unknowns is True: + self.handle_unknowns() + else: + return + try: + self.unique = np.unique(self.arr) + except Exception as e: + print(e) + return self + + def transform(self): + if self.arr is None: + print("Error - Must fit the data first.") + return + self.transformed = self.lambda_func() + return self.transformed + + def inverse_transform(self): + inverse_pairs = {} + for key, value in self.keypairs.items(): + inverse_pairs[value] = key + # TODO handle unknowns/nans inversely + self.inversed = self.lambda_func(inverse=True) + return self.inversed + + def fit_transform(self, data, keypairs, axiscol=None): + self.fit(data, keypairs, axiscol=axiscol) + self.transform() class SvmEncoder: """Categorical encoding class for HST Single Visit Mosiac regression test data inputs.""" - def __init__(self, data): + def __init__( + self, + data, + fkeys=["category", "detector", "wcstype"], + names=["cat", "det", "wcs"], + ): """Instantiates an SvmEncoder class object. Parameters ---------- data : dataframe input data containing features (columns) to be encoded + + fkeys: list + categorical-type column names (str) to be encoded + + names: list + new names to assign columns of the encoded versions of categorical data + """ self.data = data - self.df = data.copy() - self.sep = ";" - self.encodings = {"wcstype": "wcs", "cat": "cat", "detector": "det"} + self.fkeys = fkeys + self.names = names + self.df = self.categorical_data() + self.make_keypairs() + + def __repr__(self): + return ( + "encodings: %s \n category_keys: %s \n detector_keys: %s \n wcs_keys: %s" + % (self.encodings, self.category_keys, self.detector_keys, self.wcs_keys) + ) + + def categorical_data(self): + """Makes a copy of input dataframe and extracts only the categorical features based on the column names in `fkeys`. + + Returns + ------- + df: dataframe + dataframe with only the categorical feature columns + """ + return self.data.copy()[self.fkeys] + + def make_keypairs(self): + """Instantiates key-pair dictionaries for each of the categorical features.""" + self.encodings = dict(zip(self.fkeys, self.names)) self.category_keys = self.set_category_keys() - self.df_cat = self.encode_categories() + self.detector_keys = self.set_detector_keys() + self.wcs_keys = self.set_wcs_keys() - def set_category_keys(self): + def init_categories(self): """Assigns abbreviated character code as key-pair value for each type of target category classification (as determined by data on MAST archive). Returns @@ -63,7 +190,7 @@ def set_category_keys(self): dict key-pair values for image target category classification. 
""" - self.category_keys = { + return { "CALIBRATION": "C", "SOLAR SYSTEM": "SS", "ISM": "I", @@ -75,29 +202,88 @@ def set_category_keys(self): "EXT-STAR": "S", "CLUSTER OF GALAXIES": "GC", "GALAXY": "G", + "None": "U", + } + + def set_category_keys(self): + """Assigns an integer for each abbreviated character for target category classifications (as determined by data on MAST archive). Note - this could have been directly on the raw inputs from MAST, but the extra step is done to allow for debugging and analysis purposes (value-count of abbreviated versions are printed to stdout before the final encoding). + + Returns + ------- + dict + key-pair values for image target category classification. + """ + self.category_keys = { + "C": 0, + "SS": 1, + "I": 2, + "U": 3, + "SC": 4, + "S": 5, + "GC": 6, + "G": 7, } return self.category_keys - def encode_categories(self): - """Encodes the target categories of a dataframe as integer (numeric) datatype, which is required for machine learning inputs. + def set_detector_keys(self): + """Assigns a hardcoded integer to each 'detector' key in alphabetical and increasing value. + + Returns + ------- + dict + detector names and their associated integer encoding + """ + self.detector_keys = {"hrc": 0, "ir": 1, "sbc": 2, "uvis": 3, "wfc": 4} + return self.detector_keys + + def set_wcs_keys(self): + """Assigns a hardcoded integer to each 'wcs' key in alphabetical and increasing value. + + Returns + ------- + _type_ + _description_ + """ + self.wcs_keys = { + "a posteriori": 0, + "a priori": 1, + "default a": 2, + "not aligned": 3, + } + return self.wcs_keys + + def svm_keypairs(self, column): + keypairs = { + "category": self.category_keys, + "detector": self.detector_keys, + "wcstype": self.wcs_keys, + } + return keypairs[column] + + def encode_categories(self, cname="category", sep=";"): + """Transforms the raw string inputs from MAST target category naming conventions into an abbreviated form. For example, `CLUSTER OF GALAXIES;GRAVITATIONA` becomes `GC` for galaxy cluster; and `STELLAR CLUSTER;GLOBULAR CLUSTER` becomes `SC` for stellar cluster. This serves to group similar but differently named objects into a discrete set of 8 possible categorizations. The 8 categories will then be encoded into integer values in the final encoding step (machine learning inputs must be numeric). Returns ------- dataframe original dataframe with category input feature values encoded. """ - print("\n*** Encoding Category Names ***") CAT = {} - for idx, cat in self.df.category.items(): - c = cat.split(self.sep)[0] - if c in self.category_keys: - CAT[idx] = self.category_keys[c] - self.df_cat = pd.DataFrame.from_dict(CAT, orient="index", columns={"cat"}) - print("\nCategory encoding complete.") - print(self.df_cat["cat"].value_counts()) - self.df = self.df.join(self.df_cat, how="left") + ckeys = self.init_categories() + for idx, cat in self.df[cname].items(): + c = cat.split(sep)[0] + if c in ckeys: + CAT[idx] = ckeys[c] + df_cat = pd.DataFrame.from_dict(CAT, orient="index", columns={"category"}) + self.df.drop("category", axis=1, inplace=True) + self.df = self.df.join(df_cat, how="left") return self.df + def rejoin_original(self): + originals = list(self.encodings.keys()) + self.df.drop(originals, axis=1, inplace=True) + self.df = self.data.join(self.df, how="left") + def encode_features(self): """Encodes input features matching column names assigned to the object's ``encodings`` attribute. 
@@ -106,7 +292,16 @@ def encode_features(self): dataframe original dataframe with all categorical type features label-encoded. """ + self.encode_categories() + print("\n\nENCODING CATEGORICAL FEATURES") for col, name in self.encodings.items(): - encoder = LabelEncoder().fit(self.df[col]) - self.df[name] = encoder.transform(self.df[col]) + keypairs = self.svm_keypairs(col) + enc = PairEncoder() + enc.fit_transform(self.df, keypairs, axiscol=col) + self.df[name] = enc.transformed + print(f"\n*** {col} --> {name} ***") + print( + f"ORIGINAL:\n{self.df[col].value_counts()}\n\nENCODED:\n{self.df[name].value_counts()}\n" + ) + self.rejoin_original() return self.df diff --git a/spacekit/preprocessor/scrub.py b/spacekit/preprocessor/scrub.py index a010a2e..3e0422b 100644 --- a/spacekit/preprocessor/scrub.py +++ b/spacekit/preprocessor/scrub.py @@ -19,6 +19,7 @@ def __init__( save_raw=True, ): self.df = self.cache_data(cache=data) + self.col_order = col_order self.output_path = output_path self.output_file = output_file self.dropnans = dropnans @@ -60,7 +61,8 @@ def drop_nans(self): if self.dropnans is True: print("Searching for NaNs...") print(self.df.isna().sum()) - print("Dropping NaNs") + if self.df.isna().sum().values.any() > 0: + print("Dropping NaNs") self.df.dropna(axis=0, inplace=True) def drop_and_set_cols(self, label_cols=["label"]): @@ -96,7 +98,29 @@ def save_csv_file(self, pfx="", index_col="index"): class SvmScrubber(Scrubber): - """Class for invocating standard preprocessing steps of Single Visit Mosaic regression test data. This class quietly relies on other classes in the module to instantiate other scrubbing objects, although they are distinct and non-hierarchical (no inheritance between them).""" + """Class for invocating standard preprocessing steps of Single Visit Mosaic regression test data. 
+ + Parameters + ---------- + input_path : str or Path + path to directory containing data input files + data : dataframe, optional + dataframe containing raw inputs scraped from json (QA) files, by default None + output_path : str or Path, optional + location to save preprocessed output files, by default None + output_file : str, optional + file basename to assign preprocessed dataset, by default "svm_data" + dropnans : bool, optional + find and remove any NaNs, by default True + save_raw : bool, optional + save data as csv before any encoding is performed, by default True + make_pos_list : bool, optional + create a text file listing misaligned (label=1) datasets, by default True + crpt : int, optional + dataset contains synthetically corrupted data, by default 0 + make_subsamples : bool, optional + save a random selection of aligned (label=0) datasets to text file, by default False + """ def __init__( self, @@ -133,10 +157,10 @@ def preprocess_data(self): # STAGE 2 initial encoding self.df = FitsScraper(self.df, self.input_path).scrape_fits() self.df = MastScraper(self.df).scrape_mast() - if self.save_raw is True: - super().save_csv_file(pfx="raw_") # STAGE 3 final encoding self.df = SvmEncoder(self.df).encode_features() + if self.save_raw is True: + super().save_csv_file(pfx="raw_") super().drop_and_set_cols() # STAGE 4 target labels self.make_pos_label_list() diff --git a/spacekit/skopes/hst/svm/predict.py b/spacekit/skopes/hst/svm/predict.py index 39fc567..1bd85d1 100644 --- a/spacekit/skopes/hst/svm/predict.py +++ b/spacekit/skopes/hst/svm/predict.py @@ -29,6 +29,8 @@ TF_CPP_MIN_LOG_LEVEL = 2 +DETECTOR_KEY = {"hrc": 0, "ir": 1, "sbc": 2, "uvis": 3, "wfc": 4} + def load_mixed_inputs(data_file, img_path, tx=None, size=128, norm=0): """Load the regression test data and image input data, then stacks the arrays into a single combined input (list) for the ensemble model. @@ -80,7 +82,7 @@ def load_mixed_inputs(data_file, img_path, tx=None, size=128, norm=0): return [X_data, X_img] -def classification_report(df, output_path): +def classification_report(df, output_path, group=None): """Generates a scikit learn classification report with model evaluation metrics and saves to disk. Parameters @@ -89,36 +91,50 @@ def classification_report(df, output_path): Feature inputs for which the model will generate predictions. output_path : str location to store prediction output files + group: str, optional + Name for this group of data (for classification report), e.g. 
SVM-2021-11-02 """ P, T = df["y_pred"], df["det"].value_counts() C = df.loc[P == 1.0] cmp = C["det"].value_counts() - dets = ["HRC", "IR", "SBC", "UVIS", "WFC"] - separator = "---" * 5 + separator = "---" * 7 + date, time = dt.datetime.now().isoformat().split(".")[0].split("T") + if group is None: + group = "SVM-DATA" out = sys.stdout with open(f"{output_path}/clf_report.txt", "w") as f: sys.stdout = f - print("CLASSIFICATION REPORT - ", dt.datetime.now()) + print("CLASSIFICATION REPORT") + print("date: ", date) + print("time: ", time) + print("data: ", group) print(separator) - print("Mean Probability Score: ", df["y_proba"].mean()) - print("Standard Deviation: ", df["y_proba"].std()) + print("Mean Probability Score: ", np.round(df["y_proba"].mean(), 4)) + print("Standard Deviation: ", np.round(df["y_proba"].std(), 4)) print(separator) - print("Aligned ('0.0') vs Misaligned ('1.0')") + print("Alignment Evaluation") + print("0.0=aligned, 1.0=suspicious") cnt_pct = pd.concat( - [P.value_counts(), P.value_counts(normalize=True)], + [ + P.value_counts(), + P.value_counts(normalize=True), + ], axis=1, keys=["cnt", "pct"], ) print(cnt_pct) print(separator) print("Misalignment counts by Detector") - for i, d in enumerate(dets): + for d, i in DETECTOR_KEY.items(): if i in cmp: + # some alignments from this detector were suspicious print(f"{d}\t{cmp[i]} \t ({T[i]}) \t {np.round((cmp[i]/T[i])*100, 1)}%") elif i in T: + # no alignments from this detector were suspicious print(f"{d}\t0 \t ({T[i]}) \t 0%") else: - print(f"{d}\t0 \t (0) \t 0%") + # no samples from this detector in dataset + print(f"{d}\t0 \t (0) \t -") sys.stdout = out print(f"\nClassification Report created: {output_path}/clf_report.txt") with open(f"{output_path}/compromised.txt", "w") as file: @@ -127,7 +143,7 @@ def classification_report(df, output_path): print(f"\nSuspicious/Compromised List created: {output_path}/compromised.txt") -def classify_alignments(X, model, output_path=None): +def classify_alignments(X, model, output_path=None, group=None): """Returns classifier predictions and probability scores Parameters @@ -138,6 +154,8 @@ def classify_alignments(X, model, output_path=None): saved model directory path, by default None output_path : str location to store prediction output files + group: str, optional + Name for this group of data (for classification report), e.g. SVM-2021-11-02 Returns ------- @@ -158,12 +176,18 @@ def classify_alignments(X, model, output_path=None): output_file = f"{output_path}/predictions.csv" preds.to_csv(output_file, index=False) print("Y_PRED + Probabilities added. Dataframe saved to: ", output_file) - classification_report(preds, output_path) + classification_report(preds, output_path, group=group) return preds def predict_alignment( - data_file, img_path, model_path=None, output_path=None, size=128, norm=0 + data_file, + img_path, + model_path=None, + output_path=None, + size=128, + norm=0, + group=None, ): """Main calling function to load the data and model, generate predictions, and save results to disk. @@ -179,12 +203,14 @@ def predict_alignment( location to store prediction output files, by default None size : int, optional image size (width and height), by default None (128) + group: str, optional + Name for this group of data (for classification report), e.g. 
SVM-2021-11-02 """ builder = Builder(model_path=model_path) model = builder.load_saved_model() tx_file = builder.find_tx_file() X = load_mixed_inputs(data_file, img_path, tx=tx_file, size=size, norm=norm) - preds = classify_alignments(X, model, output_path=output_path) + preds = classify_alignments(X, model, output_path=output_path, group=group) return preds @@ -230,6 +256,13 @@ def predict_alignment( default=0, help="apply normalization and scaling", ) + parser.add_argument( + "-g", + "--group", + type=str, + default=None, + help="Name for this group of data (to be included in classification report)", + ) args = parser.parse_args() _ = predict_alignment( args.data_file, @@ -238,4 +271,5 @@ def predict_alignment( output_path=args.output_path, size=args.size, norm=args.normalization, + group=args.group, ) diff --git a/spacekit/skopes/hst/svm/prep.py b/spacekit/skopes/hst/svm/prep.py index d394b62..e56f74f 100644 --- a/spacekit/skopes/hst/svm/prep.py +++ b/spacekit/skopes/hst/svm/prep.py @@ -40,7 +40,7 @@ def run_preprocessing( fname : str, optional base filename to give the output files, by default "svm_data" output_path : str, optional - where to save output files. Defaults to current working directory., by default None + where to save output files. Defaults to current working directory, by default None json_pattern : str, optional glob-based search pattern, by default "*_total*_svm_*.json" visit: str, optional diff --git a/tests/preprocessor/test_encode.py b/tests/preprocessor/test_encode.py index 3a3c387..7e4d747 100644 --- a/tests/preprocessor/test_encode.py +++ b/tests/preprocessor/test_encode.py @@ -1,6 +1,7 @@ from pytest import mark -from spacekit.preprocessor.encode import SvmEncoder +from spacekit.preprocessor.encode import PairEncoder, SvmEncoder import pandas as pd +from numpy import asarray ENCODED_COL_EXPECTED = [ "detector", @@ -37,3 +38,141 @@ def test_svm_encoder(scraped_mast_file): assert True else: assert False + assert enc.df.cat[0] == 3 + assert enc.df.det[0] == 1 + assert enc.df.wcs[0] == 0 + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_unknown_val(scraped_mast_file): + data = pd.read_csv(scraped_mast_file, index_col="index") + data.loc["hst_12286_38_wfc3_ir_total_ibl738", "wcstype"] = "NaN" + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + enc = PairEncoder() + enc.fit(data, keypairs, axiscol="wcstype", handle_unknowns=False) + try: + enc.transform() + except KeyError: + assert True + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_svm_encoder_handle_unknown(scraped_mast_file): + data = pd.read_csv(scraped_mast_file, index_col="index") + data.loc["hst_12286_38_wfc3_ir_total_ibl738", "wcstype"] = "NaN" + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + enc = PairEncoder() + enc.fit(data, keypairs, axiscol="wcstype", handle_unknowns=True) + enc.transform() + assert enc.transformed[0] == 4 + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_unspecified_column(scraped_mast_file): + data = pd.read_csv(scraped_mast_file, index_col="index") + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + enc = PairEncoder() + enc.fit(data, keypairs) + assert enc.arr is None + try: + enc.keypairs + except AttributeError: + assert True + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_array_1d(): + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + arr1d = asarray(["a priori"], 
dtype=object) + enc = PairEncoder() + enc.fit(arr1d, keypairs) + enc.transform() + assert enc.transformed[0] == 1 + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_array_2d(): + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + arr2d = asarray( + [ + [ + "ir", + "ibl738", + "ANY", + 262.46, + 52.32, + 2, + "myfile.fits", + 284, + 185, + 7, + 5.87, + 13.38, + 5, + "default a", + "UNIDENTIFIED;PARALLEL FIELD", + ] + ] + ) + enc = PairEncoder() + enc.fit(arr2d, keypairs, axiscol=13) + enc.transform() + assert enc.transformed[0] == 2 + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_array_2d_unspecified_axis(): + keypairs = {"a posteriori": 0, "a priori": 1, "default a": 2, "not aligned": 3} + arr2d = asarray( + [ + [ + "ir", + "ibl738", + "ANY", + 262.46, + 52.32, + 2, + "myfile.fits", + 284, + 185, + 7, + 5.87, + 13.38, + 5, + "default a", + "UNIDENTIFIED;PARALLEL FIELD", + ] + ] + ) + enc = PairEncoder() + enc.fit(arr2d, keypairs) + try: + enc.classes_ + except AttributeError: + assert True + + +@mark.svm +@mark.preprocessor +@mark.encode +def test_pair_encoder_inverse_transform(scraped_mast_file): + data = asarray(["ir", "ir", "uvis", "wfc"], dtype=object) + detector_keys = {"hrc": 0, "ir": 1, "sbc": 2, "uvis": 3, "wfc": 4} + enc = PairEncoder() + enc.fit(data, detector_keys) + enc.transform() + assert enc.transformed == [1, 1, 3, 4] + enc.inverse_transform() + assert enc.inversed == list(data)
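
The new `spacekit.datasets.hst_cal` module gives a programmatic way to pull down the archived calcloud training iterations. A rough sketch of the intended usage, assuming the module is importable from an installed `spacekit` (the date keys come from the `calcloud_data` dictionary defined in that module):

```python
from spacekit.datasets import hst_cal

# download the most recent calcloud training archive and load it into a dataframe
df = hst_cal.load_data()

# or pin a specific training iteration by its date key
df_oct = hst_cal.load_data(date_key="2021-10-28")

# download every archived iteration (datasets, models, results)
fpaths = hst_cal.download_archives()
```
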