TOM interface preparation #148

Open · wants to merge 15 commits into master
13 changes: 5 additions & 8 deletions docs/canonical.rst
@@ -32,9 +32,9 @@ but composed of different objects.

>>> # define variables
>>> data_dir = 'data/SIMGEN_PUBLIC_DES/'
->>> output_sample_file = 'results/Bazin_SNPCC_canonical.dat'
->>> output_metadata_file = 'results/Bazin_metadata.dat'
->>> features_file = 'results/Bazin.dat'
+>>> output_sample_file = 'results/Bazin_SNPCC_canonical.csv'
+>>> output_metadata_file = 'results/Bazin_metadata.csv'
+>>> features_file = 'results/Bazin.csv'

>>> sample = build_snpcc_canonical(path_to_raw_data=data_dir, path_to_features=features_file,
>>> output_canonical_file=output_sample_file,
@@ -68,11 +68,8 @@ In the command line, using the same parameters as in the code above, you can do
>>> -o <output file for canonical sample> -p <comparison plot file>
>>> -s <if True save metadata to file>

-You can check that the file ``results/Bazin_SNPCC_canonical.dat`` is very similar to the original features file.
-The only difference is that now a few of the ``sample`` variables are set to ``queryable``:
-
-.. literalinclude:: images/sample_canonical.dat
-   :lines: 1-2, 9-14
+You can check that the file ``results/Bazin_SNPCC_canonical.csv`` is very similar to the original features file.
+The only difference is that now the ``queryable`` column marks objects in the canonical sample as ``True``.

This means that you can use the :py:mod:`resspect.learn_loop` module in combination with a ``RandomSampling`` strategy but
reading data from the canonical sample. In this way, at each iteration the code will select a random object from the test sample,
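As a usage note: the loop this documentation section points to can be sketched as below, drawing random queries from the canonical sample. This is a minimal sketch, not part of the PR; the keyword names follow the documented `learn_loop` interface and may differ between versions, and the output paths are placeholders.

>>> from resspect import learn_loop

>>> learn_loop(nloops=1000,                        # number of active-learning iterations
>>>            strategy='RandomSampling',          # random queries, as described above
>>>            path_to_features='results/Bazin_SNPCC_canonical.csv',
>>>            output_metrics_file='results/metrics.csv',    # placeholder output path
>>>            output_queried_file='results/queried.csv',    # placeholder output path
>>>            classifier='RandomForest')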
21,285 changes: 0 additions & 21,285 deletions docs/images/sample_canonical.dat

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -11,7 +11,7 @@ authors = [
]
license = { text = "GPL-3.0-or-later" }
dependencies = [
-    "flask==2.1.1",
+    "flask==2.3.2",
"astropy>=5.2.1",
"matplotlib>=3.7.0",
"numpy>=1.24.2",
4 changes: 1 addition & 3 deletions resspect/__init__.py
@@ -69,13 +69,11 @@
'fit_bump',
'fit_scipy',
'fit_snpcc',
-'fit_plasticc_bazin',
-'fit_resspect_bazin',
+'fit_plasticc',
'fom',
'get_cosmo_metric',
'get_snpcc_metric',
'get_SNR_headers',
-#'gradient_boosted_trees',
'knn',
'learn_loop',
'load_dataset',
28 changes: 15 additions & 13 deletions resspect/build_snpcc_canonical.py
@@ -131,10 +131,10 @@ def snpcc_get_canonical_info(self, path_to_rawdata: str,
if not canonical_input_file:
raise ValueError('File not found! Set "calculate = True" '
'to build canonical info file.')
-self.meta_data = pd.read_csv(canonical_input_file, sep=' ',
+self.meta_data = pd.read_csv(canonical_input_file,
index_col=False)
if save:
-self.meta_data.to_csv(canonical_output_file, sep=' ', index=False)
+self.meta_data.to_csv(canonical_output_file, index=False)

def get_light_curves_meta_data(self, path_to_rawdata: str) -> list:
light_cure_files = _get_files_list(path_to_rawdata)
@@ -269,7 +269,7 @@ def find_neighbors(self, number_of_neighbors: int = 10,
nearest_neighbor_class = NearestNeighbors(
n_neighbors=number_of_neighbors,
algorithm=nearest_neighbor_algorithm)
-current_train_sample = train_samples[index].values
+current_train_sample = train_samples[index]
nearest_neighbor_indices.append(
self._get_nearest_neighbor_indices(nearest_neighbor_class,
each_test_sample,
@@ -319,18 +319,19 @@ def get_meta_data_from_features(path_to_features: str,
path_to_features: str
Complete path to Bazin features files
features_method: str (optional)
-Method for feature extraction. Only 'Bazin' is implemented.
+Method for feature extraction. Only 'bazin' is implemented.
"""
data = DataBase()
-data.load_features(path_to_file=path_to_features, method=features_method,
+data.load_features(path_to_file=path_to_features, feature_extractor=features_method,
screen=screen)
return data


def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
output_canonical_file: str, output_info_file='',
compute=True, save=True, input_info_file='',
-features_method='Bazin', screen=False):
+features_method='bazin', screen=False,
+number_of_neighbors=1):
"""Build canonical sample for SNPCC data.

Parameters
@@ -357,7 +358,9 @@ def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
Save simulation metadata information to file.
Default is True.
screen: bool (optional)
-If true, display steps info on screen. Default is False.
+If true, display steps info on screen. Default is False.
+number_of_neighbors: int (optional)
+Number of neighbors in each sample. Default is 1.

Returns
-------
@@ -376,24 +379,23 @@ def build_snpcc_canonical(path_to_raw_data: str, path_to_features: str,
# identify samples
sample.snpcc_identify_samples()
# find neighbors
-sample.find_neighbors()
+sample.find_neighbors(number_of_neighbors=number_of_neighbors)

# get metadata from features file
features_data = get_meta_data_from_features(
path_to_features, features_method, screen)
sample.header = features_data.metadata
# identify new samples

-features_data.metadata["queryable"][
-    features_data.metadata["id"].isin(sample.canonical_ids)] = True
-features_data.metadata["queryable"][
-    ~features_data.metadata["id"].isin(sample.canonical_ids)] = False
+flag = features_data.metadata["id"].isin(sample.canonical_ids)
+for i in range(flag.shape[0]):
+    features_data.metadata.at[i, 'queryable'] = flag[i]

# save to file
features = pd.DataFrame(features_data.features,
columns=features_data.features_names)
features_data.data = pd.concat([features_data.metadata, features], axis=1)
-features_data.data.to_csv(output_canonical_file, sep=' ', index=False)
+features_data.data.to_csv(output_canonical_file, index=False)

# update Canonical object
sample.canonical_sample = features_data.data
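A quick sketch of the new `number_of_neighbors` argument in action, assuming the top-level import used in docs/canonical.rst; the paths are the ones from that document and the keyword names come from the updated signature in this diff.

>>> from resspect import build_snpcc_canonical

>>> sample = build_snpcc_canonical(path_to_raw_data='data/SIMGEN_PUBLIC_DES/',
>>>                                path_to_features='results/Bazin.csv',
>>>                                output_canonical_file='results/Bazin_SNPCC_canonical.csv',
>>>                                output_info_file='results/Bazin_metadata.csv',
>>>                                features_method='bazin',    # lowercase, per this diff
>>>                                number_of_neighbors=5)      # new argument; default is 1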
49 changes: 45 additions & 4 deletions resspect/database.py
@@ -256,8 +256,6 @@ def load_features_from_file(self, path_to_features_file: str, screen=False,

else:
data = pd.read_csv(path_to_features_file, index_col=False)
-if 'redshift' not in data.keys():
-    data = pd.read_csv(path_to_features_file, index_col=False)

# check if queryable is there
if 'queryable' not in data.keys():
@@ -420,9 +418,50 @@ def load_photometry_features(self, path_to_photometry_file: str,
print('\n Loaded ', self.test_metadata.shape[0],
' samples! \n')

+def query_db(self, sample=None):
+    raise NotImplementedError
+
+def load_photometry_from_db(self, sample=None):
+    # sample can be None, 'train', 'validation', 'pool' or 'test',
+    # depending on what kind of dataset is being built by this call;
+    # resspect keeps each of these in separate variables.
+    # None means that the call to the db determines the split into
+    # subsets by itself, for instance if the DB already has a notion
+    # of these subsets.
+    #
+    # The query returns a table 'data' that must have
+    # - the features for the subset of type 'sample'
+    # - the metadata, with keys 'id', 'redshift', 'type', 'code',
+    #   'orig_sample', 'queryable', and optionally last_rmag
+    #   and cost_<name> where name is a telescope name
+
+    d, md = self.query_db(sample)
+
+    if sample is None:
+        self.features = d
+        self.metadata = md
+
+    elif sample == 'train':
+        self.train_features = d
+        self.train_metadata = md
+
+    elif sample == 'test':
+        self.test_features = d
+        self.test_metadata = md
+
+    elif sample == 'validation':
+        self.validation_features = d
+        self.validation_metadata = md
+
+    elif sample == 'pool':
+        self.pool_features = d
+        self.pool_metadata = md

def load_features(self, path_to_file: str, feature_extractor: str ='bazin',
                  screen=False, survey='DES', sample=None):
-"""Load features according to the chosen feature extraction method.
+"""
+Load features according to the chosen feature extraction method.

Populates properties: data, features, feature_list, header
and header_list.
@@ -447,7 +486,9 @@ def load_features(self, path_to_file: str, feature_extractor: str ='bazin',
else, read independent files for 'train' and 'test'.
Default is None.
"""
-if feature_extractor == "photometry":
+if feature_extractor == "db":
+    self.load_photometry_from_db(sample)
+elif feature_extractor == "photometry":
self.load_photometry_features(path_to_file, screen=screen,
survey=survey, sample=sample)
elif feature_extractor in FEATURE_EXTRACTOR_MAPPING:
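Since `query_db` is left as a `NotImplementedError` stub, here is one hedged sketch of a backend that would satisfy the contract spelled out in the comments above. The SQLite file, table name, and column layout are invented for illustration only and are not part of this PR.

import sqlite3

import pandas as pd


def query_db(self, sample=None):
    # Invented backend: a local SQLite table 'features' holding one row per
    # object, with the metadata keys required by the contract plus features.
    conn = sqlite3.connect('tom_sandbox.db')   # hypothetical database file
    where = '' if sample is None else f" WHERE orig_sample = '{sample}'"
    data = pd.read_sql(f'SELECT * FROM features{where}', conn)
    conn.close()
    # Split the table into metadata and features, as load_photometry_from_db expects.
    meta_keys = ['id', 'redshift', 'type', 'code', 'orig_sample', 'queryable']
    metadata = data[meta_keys]
    features = data.drop(columns=meta_keys)
    return features, metadata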
4 changes: 2 additions & 2 deletions resspect/feature_extractors/bazin.py
@@ -13,8 +13,8 @@


class BazinFeatureExtractor(LightCurve):
-def __init__(self):
-    super().__init__()
+def __init__(self, lc=None):
+    super().__init__(lc)
self.features_names = ['a', 'b', 't0', 'tfall', 'trise']

def evaluate(self, time: np.array) -> dict:
4 changes: 2 additions & 2 deletions resspect/feature_extractors/bump.py
@@ -11,8 +11,8 @@


class BumpFeatureExtractor(LightCurve):
-def __init__(self):
-    super().__init__()
+def __init__(self, lc=None):
+    super().__init__(lc)
self.features_names = ['p1', 'p2', 'p3', 'time_shift', 'max_flux']

def evaluate(self, time: np.array) -> dict:
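The `lc=` copy-constructor added to both extractors lets a plain `LightCurve` be promoted to a concrete feature extractor without re-reading the photometry. A short usage sketch; the data file path is hypothetical.

from resspect.feature_extractors.bazin import BazinFeatureExtractor
from resspect.feature_extractors.light_curve import LightCurve

lc = LightCurve()
lc.load_snpcc_lc('data/SIMGEN_PUBLIC_DES/DES_SN000001.DAT')  # hypothetical path

# Copies photometry, ids and metadata from lc; the extractor then sets
# its own features_names.
bazin_lc = BazinFeatureExtractor(lc)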
82 changes: 48 additions & 34 deletions resspect/feature_extractors/light_curve.py
@@ -18,6 +18,7 @@
import logging
from typing import Tuple
import warnings
+import os

import numpy as np
import pandas as pd
@@ -28,12 +29,8 @@
from resspect.lightcurves_utils import get_photometry_with_id_name_and_snid
from resspect.lightcurves_utils import read_plasticc_full_photometry_data
from resspect.lightcurves_utils import load_plasticc_photometry_df
-from resspect.lightcurves_utils import read_resspect_full_photometry_data
-from resspect.lightcurves_utils import insert_band_column_to_resspect_df
-from resspect.lightcurves_utils import load_resspect_photometry_df
from resspect.lightcurves_utils import get_snpcc_sntype


warnings.filterwarnings("ignore", category=RuntimeWarning)
logging.basicConfig(level=logging.INFO)

@@ -105,7 +102,15 @@ class LightCurve:

"""

-def __init__(self):
+def __init__(self, lc=None):
+    if lc is None:
+        self._non_copy_constructor()
+    else:
+        if not isinstance(lc, LightCurve):
+            raise RuntimeError("argument is not a LightCurve object: "
+                               + str(type(lc)))
+        self._copy_constructor(lc)
+
+def _non_copy_constructor(self):
self.queryable = None
self.features = []
#self.features_names = ['p1', 'p2', 'p3', 'time_shift', 'max_flux']
@@ -124,6 +129,43 @@ def __init__(self):
self.sncode = 0
self.sntype = ' '

+def _copy_constructor(self, lc):
+    self.queryable = lc.queryable
+    self.features = lc.features
+    self.dataset_name = lc.dataset_name
+    self.exp_time = lc.exp_time
+    self.filters = lc.filters
+    self.full_photometry = lc.full_photometry
+    self.id = lc.id
+    self.id_name = lc.id_name
+    self.last_mag = lc.last_mag
+    self.photometry = lc.photometry
+    self.redshift = lc.redshift
+    self.sample = lc.sample
+    self.sim_peakmag = lc.sim_peakmag
+    self.sim_pkmjd = lc.sim_pkmjd
+    self.sncode = lc.sncode
+    self.sntype = lc.sntype
+
+@staticmethod
+def from_file(filename: str) -> list:
+    light_curves = []
+    with open(filename, 'r') as f:
+        for ff in f.readlines():
+            survey, path = ff.split()
+            if not os.path.exists(path):
+                raise FileNotFoundError('File not found: ' + path)
+            survey = survey.strip().upper()
+            lc = LightCurve()
+            if survey == "SNPCC":
+                lc.load_snpcc_lc(path)
+            elif survey == "PLASTICC":
+                lc.load_plasticc_lc(path)
+            else:
+                raise NameError("survey argument not recognized: " + survey)
+            light_curves.append(lc)
+    return light_curves

def _get_snpcc_photometry_raw_and_header(
self, lc_data: np.ndarray,
sntype_test_value: str = "-9") -> Tuple[np.ndarray, list]:
@@ -180,38 +222,10 @@ def load_snpcc_lc(self, path_to_data: str):
photometry_raw, header = self._get_snpcc_photometry_raw_and_header(
lc_data)

+self.metadata = [self.id, self.redshift, self.sntype, self.sncode, self.sample]
if photometry_raw.size > 0:
self.photometry = load_snpcc_photometry_df(photometry_raw, header)

-def load_resspect_lc(self, photo_file: str, snid: int):
-    """
-    Return 1 light curve from RESSPECT simulations.
-
-    Parameters
-    ----------
-    photo_file: str
-        Complete path to light curves file.
-    snid: int
-        Identification number for the desired light curve.
-    """
-
-    self.dataset_name = 'RESSPECT'
-    self.filters = ['u', 'g', 'r', 'i', 'z', 'Y']
-    self.id = snid
-
-    if self.full_photometry.empty:
-        _, self.full_photometry = read_resspect_full_photometry_data(
-            photo_file)
-    id_names_list = ['SNID', 'snid', 'objid', 'id']
-    filtered_photometry, self.id_name = (
-        get_photometry_with_id_name_and_snid(
-            self.full_photometry, id_names_list, snid))
-
-    if not filtered_photometry.empty:
-        filtered_photometry = insert_band_column_to_resspect_df(
-            filtered_photometry, self.filters)
-        self.photometry = load_resspect_photometry_df(filtered_photometry)

def load_plasticc_lc(self, photo_file: str, snid: int):
"""
Return 1 light curve from PLAsTiCC simulations.
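A usage sketch for the new `LightCurve.from_file` helper, which reads a plain-text listing with one `survey path` pair per line; the listing below is invented for illustration.

# Contents of lc_list.txt (invented), one "survey path" pair per line:
#   SNPCC data/SIMGEN_PUBLIC_DES/DES_SN000001.DAT
#   SNPCC data/SIMGEN_PUBLIC_DES/DES_SN000002.DAT
from resspect.feature_extractors.light_curve import LightCurve

light_curves = LightCurve.from_file('lc_list.txt')
for lc in light_curves:
    print(lc.id, lc.redshift, lc.sample)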