TOM interface preparation #148

Open · wants to merge 15 commits into master
13 changes: 5 additions & 8 deletions docs/canonical.rst
@@ -32,9 +32,9 @@ but composed of different objects.

>>> # define variables
>>> data_dir = 'data/SIMGEN_PUBLIC_DES/'
->>> output_sample_file = 'results/Bazin_SNPCC_canonical.dat'
->>> output_metadata_file = 'results/Bazin_metadata.dat'
->>> features_file = 'results/Bazin.dat'
+>>> output_sample_file = 'results/Bazin_SNPCC_canonical.csv'
+>>> output_metadata_file = 'results/Bazin_metadata.csv'
+>>> features_file = 'results/Bazin.csv'

>>> sample = build_snpcc_canonical(path_to_raw_data=data_dir, path_to_features=features_file,
>>> output_canonical_file=output_sample_file,
@@ -68,11 +68,8 @@ In the command line, using the same parameters as in the code above, you can do
>>> -o <output file for canonical sample> -p <comparison plot file>
>>> -s <if True save metadata to file>

-You can check that the file ``results/Bazin_SNPCC_canonical.dat`` is very similar to the original features file.
-The only difference is that now a few of the ``sample`` variables are set to ``queryable``:
-
-.. literalinclude:: images/sample_canonical.dat
-   :lines: 1-2, 9-14
+You can check that the file ``results/Bazin_SNPCC_canonical.csv`` is very similar to the original features file.
+The only difference is that now the ``queryable`` column marks objects in the canonical sample as ``True``.

This means that you can use the :py:mod:`resspect.learn_loop` module in combination with a ``RandomSampling`` strategy but
reading data from the canonical sample. In this way, at each iteration the code will select a random object from the test sample,
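As a usage note: the loop this documentation section points to can be sketched as below, drawing random queries from the canonical sample. This is a minimal sketch, not part of the PR; the keyword names follow the documented `learn_loop` interface and may differ between versions, and the output paths are placeholders.

>>> from resspect import learn_loop

>>> learn_loop(nloops=1000,                        # number of active-learning iterations
>>>            strategy='RandomSampling',          # random queries, as described above
>>>            path_to_features='results/Bazin_SNPCC_canonical.csv',
>>>            output_metrics_file='results/metrics.csv',    # placeholder output path
>>>            output_queried_file='results/queried.csv',    # placeholder output path
>>>            classifier='RandomForest')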
21,285 changes: 0 additions & 21,285 deletions docs/images/sample_canonical.dat

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -11,7 +11,7 @@ authors = [
]
license = { text = "GPL-3.0-or-later" }
dependencies = [
-    "flask==2.1.1",
+    "flask==2.3.2",
"astropy>=5.2.1",
"matplotlib>=3.7.0",
"numpy>=1.24.2",
4 changes: 1 addition & 3 deletions resspect/__init__.py
@@ -69,13 +69,11 @@
'fit_bump',
'fit_scipy',
'fit_snpcc',
-'fit_plasticc_bazin',
-'fit_resspect_bazin',
+'fit_plasticc',
'fom',
'get_cosmo_metric',
'get_snpcc_metric',
'get_SNR_headers',
-#'gradient_boosted_trees',
'knn',
'learn_loop',
'load_dataset',
28 changes: 15 additions & 13 deletions resspect/build_snpcc_canonical.py
@@ -131,10 +131,10 @@ def snpcc_get_canonical_info(self, path_to_rawdata: str,
if not canonical_input_file:
raise ValueError('File not found! Set "calculate = True" '
'to build canonical info file.')
-self.meta_data = pd.read_csv(canonical_input_file, sep=' ',
+self.meta_data = pd.read_csv(canonical_input_file,
index_col=False)
if save:
-self.meta_data.to_csv(canonical_output_file, sep=' ', index=False)
+self.meta_data.to_csv(canonical_output_file, index=False)

def get_light_curves_meta_data(self, path_to_rawdata: str) -> list:
light_cure_files = _get_files_list(path_to_rawdata)
@@ -269,7 +269,7 @@ def find_neighbors(self, number_of_neighbors: int = 10,
nearest_neighbor_class = NearestNeighbors(
n_neighbors=number_of_neighbors,
algorithm=nearest_neighbor_algorithm)
-current_train_sample = train_samples[index].values
+current_train_sample = train_samples[index]
nearest_neighbor_indices.append(
self._get_nearest_neighbor_indices(nearest_neighbor_class,
each_test_sample,
@@ -319,18 +319,19 @@ def get_meta_data_from_features(path_to_features: str,
path_to_features: str
Complete path to Bazin features files
features_method: str (optional)
-Method for feature extraction. Only 'Bazin' is implemented.
+Method for feature extraction. Only 'bazin' is implemented.
"""
data = DataBase()
-data.load_features(path_to_file=path_to_features, method=features_method,
+data.load_features(path_to_file=path_to_features, feature_extractor=features_method,
screen=screen)
return data


def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
output_canonical_file: str, output_info_file='',
compute=True, save=True, input_info_file='',
-features_method='Bazin', screen=False):
+features_method='bazin', screen=False,
+number_of_neighbors=1):
"""Build canonical sample for SNPCC data.

Parameters
@@ -357,7 +358,9 @@ def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
Save simulation metadata information to file.
Default is True.
screen: bool (optional)
-If true, display steps info on screen. Default is False.
+If true, display steps info on screen. Default is False.
+number_of_neighbors: int (optional)
+Number of neighbors in each sample. Default is 1.

Returns
-------
@@ -376,24 +379,23 @@ def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
# identify samples
sample.snpcc_identify_samples()
# find neighbors
-sample.find_neighbors()
+sample.find_neighbors(number_of_neighbors=number_of_neighbors)

# get metadata from features file
features_data = get_meta_data_from_features(
path_to_features, features_method, screen)
sample.header = features_data.metadata
# identify new samples

-features_data.metadata["queryable"][
-    features_data.metadata["id"].isin(sample.canonical_ids)] = True
-features_data.metadata["queryable"][
-    ~features_data.metadata["id"].isin(sample.canonical_ids)] = False
+flag = features_data.metadata["id"].isin(sample.canonical_ids)
+for i in range(flag.shape[0]):
+    features_data.metadata.at[i, 'queryable'] = flag[i]

# save to file
features = pd.DataFrame(features_data.features,
columns=features_data.features_names)
features_data.data = pd.concat([features_data.metadata, features], axis=1)
-features_data.data.to_csv(output_canonical_file, sep=' ', index=False)
+features_data.data.to_csv(output_canonical_file, index=False)

# update Canonical object
sample.canonical_sample = features_data.data
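A quick sketch of the new `number_of_neighbors` argument in action, assuming the top-level import used in docs/canonical.rst; the paths are the ones from that document and the keyword names come from the updated signature in this diff.

>>> from resspect import build_snpcc_canonical

>>> sample = build_snpcc_canonical(path_to_raw_data='data/SIMGEN_PUBLIC_DES/',
>>>                                path_to_features='results/Bazin.csv',
>>>                                output_canonical_file='results/Bazin_SNPCC_canonical.csv',
>>>                                output_info_file='results/Bazin_metadata.csv',
>>>                                features_method='bazin',    # lowercase, per this diff
>>>                                number_of_neighbors=5)      # new argument; default is 1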
49 changes: 45 additions & 4 deletions resspect/database.py
@@ -256,8 +256,6 @@ def load_features_from_file(self, path_to_features_file: str, screen=False,

else:
data = pd.read_csv(path_to_features_file, index_col=False)
-if 'redshift' not in data.keys():
-    data = pd.read_csv(path_to_features_file, index_col=False)

# check if queryable is there
if 'queryable' not in data.keys():
@@ -420,9 +418,50 @@ def load_photometry_features(self, path_to_photometry_file: str,
print('\n Loaded ', self.test_metadata.shape[0],
' samples! \n')

+def query_db(self, sample=None):
+    raise NotImplementedError
+
+def load_photometry_from_db(self, sample=None):
+    # sample can be None, 'train', 'validation', 'pool' or 'test',
+    # depending on what kind of dataset is being built by this call;
+    # resspect keeps each of these in separate variables.
+    # None means that the call to the db determines the split into
+    # subsets by itself, for instance if the DB already has a notion
+    # of these subsets.
+    #
+    # The query returns a table 'data' that must have
+    # - the features for the subset of type 'sample'
+    # - the metadata, with keys 'id', 'redshift', 'type', 'code',
+    #   'orig_sample', 'queryable', and optionally last_rmag
+    #   and cost_<name> where name is a telescope name
+
+    d, md = self.query_db(sample)
+
+    if sample is None:
+        self.features = d
+        self.metadata = md
+
+    elif sample == 'train':
+        self.train_features = d
+        self.train_metadata = md
+
+    elif sample == 'test':
+        self.test_features = d
+        self.test_metadata = md
+
+    elif sample == 'validation':
+        self.validation_features = d
+        self.validation_metadata = md
+
+    elif sample == 'pool':
+        self.pool_features = d
+        self.pool_metadata = md

def load_features(self, path_to_file: str, feature_extractor: str ='bazin',
                  screen=False, survey='DES', sample=None):
-"""Load features according to the chosen feature extraction method.
+"""
+Load features according to the chosen feature extraction method.

Populates properties: data, features, feature_list, header
and header_list.
@@ -447,7 +486,9 @@ def load_features(self, path_to_file: str, feature_extractor: str ='bazin',
else, read independent files for 'train' and 'test'.
Default is None.
"""
-if feature_extractor == "photometry":
+if feature_extractor == "db":
+    self.load_photometry_from_db(sample)
+elif feature_extractor == "photometry":
self.load_photometry_features(path_to_file, screen=screen,
survey=survey, sample=sample)
elif feature_extractor in FEATURE_EXTRACTOR_MAPPING:
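Since `query_db` is left as a `NotImplementedError` stub, here is one hedged sketch of a backend that would satisfy the contract spelled out in the comments above. The SQLite file, table name, and column layout are invented for illustration only and are not part of this PR.

import sqlite3

import pandas as pd


def query_db(self, sample=None):
    # Invented backend: a local SQLite table 'features' holding one row per
    # object, with the metadata keys required by the contract plus features.
    conn = sqlite3.connect('tom_sandbox.db')   # hypothetical database file
    where = '' if sample is None else f" WHERE orig_sample = '{sample}'"
    data = pd.read_sql(f'SELECT * FROM features{where}', conn)
    conn.close()
    # Split the table into metadata and features, as load_photometry_from_db expects.
    meta_keys = ['id', 'redshift', 'type', 'code', 'orig_sample', 'queryable']
    metadata = data[meta_keys]
    features = data.drop(columns=meta_keys)
    return features, metadata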
4 changes: 2 additions & 2 deletions resspect/feature_extractors/bazin.py
@@ -13,8 +13,8 @@


class BazinFeatureExtractor(LightCurve):
-def __init__(self):
-    super().__init__()
+def __init__(self, lc=None):
+    super().__init__(lc)
self.features_names = ['a', 'b', 't0', 'tfall', 'trise']

def evaluate(self, time: np.array) -> dict:
4 changes: 2 additions & 2 deletions resspect/feature_extractors/bump.py
@@ -11,8 +11,8 @@


class BumpFeatureExtractor(LightCurve):
-def __init__(self):
-    super().__init__()
+def __init__(self, lc=None):
+    super().__init__(lc)
self.features_names = ['p1', 'p2', 'p3', 'time_shift', 'max_flux']

def evaluate(self, time: np.array) -> dict:
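The `lc=` copy-constructor added to both extractors lets a plain `LightCurve` be promoted to a concrete feature extractor without re-reading the photometry. A short usage sketch; the data file path is hypothetical.

from resspect.feature_extractors.bazin import BazinFeatureExtractor
from resspect.feature_extractors.light_curve import LightCurve

lc = LightCurve()
lc.load_snpcc_lc('data/SIMGEN_PUBLIC_DES/DES_SN000001.DAT')  # hypothetical path

# Copies photometry, ids and metadata from lc; the extractor then sets
# its own features_names.
bazin_lc = BazinFeatureExtractor(lc)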
82 changes: 48 additions & 34 deletions resspect/feature_extractors/light_curve.py
@@ -18,6 +18,7 @@
import logging
from typing import Tuple
import warnings
+import os

import numpy as np
import pandas as pd
@@ -28,12 +29,8 @@
from resspect.lightcurves_utils import get_photometry_with_id_name_and_snid
from resspect.lightcurves_utils import read_plasticc_full_photometry_data
from resspect.lightcurves_utils import load_plasticc_photometry_df
-from resspect.lightcurves_utils import read_resspect_full_photometry_data
-from resspect.lightcurves_utils import insert_band_column_to_resspect_df
-from resspect.lightcurves_utils import load_resspect_photometry_df
from resspect.lightcurves_utils import get_snpcc_sntype


warnings.filterwarnings("ignore", category=RuntimeWarning)
logging.basicConfig(level=logging.INFO)

@@ -105,7 +102,15 @@ class LightCurve:

"""

-def __init__(self):
+def __init__(self, lc=None):
+    if lc is None:
+        self._non_copy_constructor()
+    else:
+        if not isinstance(lc, LightCurve):
+            raise RuntimeError("argument is not a LightCurve object: "
+                               + str(type(lc)))
+        self._copy_constructor(lc)
+
+def _non_copy_constructor(self):
self.queryable = None
self.features = []
#self.features_names = ['p1', 'p2', 'p3', 'time_shift', 'max_flux']
@@ -124,6 +129,43 @@ def __init__(self):
self.sncode = 0
self.sntype = ' '

+def _copy_constructor(self, lc):
+    self.queryable = lc.queryable
+    self.features = lc.features
+    self.dataset_name = lc.dataset_name
+    self.exp_time = lc.exp_time
+    self.filters = lc.filters
+    self.full_photometry = lc.full_photometry
+    self.id = lc.id
+    self.id_name = lc.id_name
+    self.last_mag = lc.last_mag
+    self.photometry = lc.photometry
+    self.redshift = lc.redshift
+    self.sample = lc.sample
+    self.sim_peakmag = lc.sim_peakmag
+    self.sim_pkmjd = lc.sim_pkmjd
+    self.sncode = lc.sncode
+    self.sntype = lc.sntype
+
+@staticmethod
+def from_file(filename: str) -> list:
+    light_curves = []
+    with open(filename, 'r') as f:
+        for ff in f.readlines():
+            survey, path = ff.split()
+            if not os.path.exists(path):
+                raise FileNotFoundError('File not found: ' + path)
+            survey = survey.strip().upper()
+            lc = LightCurve()
+            if survey == "SNPCC":
+                lc.load_snpcc_lc(path)
+            elif survey == "PLASTICC":
+                lc.load_plasticc_lc(path)
+            else:
+                raise NameError("survey argument not recognized: " + survey)
+            light_curves.append(lc)
+    return light_curves

def _get_snpcc_photometry_raw_and_header(
self, lc_data: np.ndarray,
sntype_test_value: str = "-9") -> Tuple[np.ndarray, list]:
@@ -180,38 +222,10 @@ def load_snpcc_lc(self, path_to_data: str):
photometry_raw, header = self._get_snpcc_photometry_raw_and_header(
lc_data)

+self.metadata = [self.id, self.redshift, self.sntype, self.sncode, self.sample]
if photometry_raw.size > 0:
self.photometry = load_snpcc_photometry_df(photometry_raw, header)

-def load_resspect_lc(self, photo_file: str, snid: int):
-    """
-    Return 1 light curve from RESSPECT simulations.
-
-    Parameters
-    ----------
-    photo_file: str
-        Complete path to light curves file.
-    snid: int
-        Identification number for the desired light curve.
-    """
-
-    self.dataset_name = 'RESSPECT'
-    self.filters = ['u', 'g', 'r', 'i', 'z', 'Y']
-    self.id = snid
-
-    if self.full_photometry.empty:
-        _, self.full_photometry = read_resspect_full_photometry_data(
-            photo_file)
-    id_names_list = ['SNID', 'snid', 'objid', 'id']
-    filtered_photometry, self.id_name = (
-        get_photometry_with_id_name_and_snid(
-            self.full_photometry, id_names_list, snid))
-
-    if not filtered_photometry.empty:
-        filtered_photometry = insert_band_column_to_resspect_df(
-            filtered_photometry, self.filters)
-        self.photometry = load_resspect_photometry_df(filtered_photometry)

def load_plasticc_lc(self, photo_file: str, snid: int):
"""
Return 1 light curve from PLAsTiCC simulations.
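A usage sketch for the new `LightCurve.from_file` helper, which reads a plain-text listing with one `survey path` pair per line; the listing below is invented for illustration.

# Contents of lc_list.txt (invented), one "survey path" pair per line:
#   SNPCC data/SIMGEN_PUBLIC_DES/DES_SN000001.DAT
#   SNPCC data/SIMGEN_PUBLIC_DES/DES_SN000002.DAT
from resspect.feature_extractors.light_curve import LightCurve

light_curves = LightCurve.from_file('lc_list.txt')
for lc in light_curves:
    print(lc.id, lc.redshift, lc.sample)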