improved performance and updated API for interpolation

johannesulf · Jun 30, 2020 · 395d4dc · 395d4dc
1 parent 0a64f88
commit 395d4dc
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 164 deletions.
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup
 
 setup(name='tabcorr',
-      version='0.5.1',
+      version='0.6',
       description='Tabulated correlation functions for halotools',
       url='https://github.com/johannesulf/TabCorr',
       author='Johannes Ulf Lange',

diff --git a/tabcorr/tabcorr.py b/tabcorr/tabcorr.py
@@ -325,6 +325,13 @@ def tabulate(cls, halocat, tpcf, *tpcf_args,
         if project_xyz and mode == 'auto':
             tpcf_matrix /= 3.0
 
+        if mode == 'auto':
+            tpcf_matrix_flat = []
+            for i in range(tpcf_matrix.shape[0]):
+                tpcf_matrix_flat.append(symmetric_matrix_to_array(
+                    tpcf_matrix[i]))
+            tpcf_matrix = np.array(tpcf_matrix_flat)
+
         halotab.attrs = {}
         halotab.attrs['tpcf'] = tpcf.__name__
         halotab.attrs['mode'] = mode
@@ -367,15 +374,7 @@ def read(cls, fname):
         for key in fstream.attrs.keys():
             halotab.attrs[key] = fstream.attrs[key]
 
-        tpcf_matrix = fstream['tpcf_matrix'][()]
-        if halotab.attrs['mode'] == 'auto' and len(tpcf_matrix.shape) == 2:
-            halotab.tpcf_matrix = []
-            for i in range(tpcf_matrix.shape[0]):
-                halotab.tpcf_matrix.append(array_to_symmetric_matrix(
-                    tpcf_matrix[i]))
-            halotab.tpcf_matrix = np.array(halotab.tpcf_matrix)
-        else:
-            halotab.tpcf_matrix = tpcf_matrix
+        halotab.tpcf_matrix = fstream['tpcf_matrix'][()]
 
         halotab.tpcf_args = []
         for key in fstream['tpcf_args'].keys():
@@ -420,15 +419,7 @@ def write(self, fname, overwrite=False, max_args_size=1000000,
         for key in keys:
             fstream.attrs[key] = self.attrs[key]
 
-        if self.attrs['mode'] == 'auto':
-            tpcf_matrix_write = []
-            for i in range(self.tpcf_matrix.shape[0]):
-                tpcf_matrix_write.append(symmetric_matrix_to_array(
-                    self.tpcf_matrix[i]))
-            fstream['tpcf_matrix'] = np.array(tpcf_matrix_write,
-                                              dtype=matrix_dtype)
-        else:
-            fstream['tpcf_matrix'] = self.tpcf_matrix.astype(matrix_dtype)
+        fstream['tpcf_matrix'] = self.tpcf_matrix.astype(matrix_dtype)
 
         for i, arg in enumerate(self.tpcf_args):
             if (type(arg) is not np.ndarray or
@@ -525,16 +516,16 @@ def predict(self, model, separate_gal_type=False, **occ_kwargs):
         ngal = mean_occupation * self.gal_type['n_h'].data
 
         if self.attrs['mode'] == 'auto':
-            xi = self.tpcf_matrix * np.outer(ngal, ngal) / np.sum(ngal)**2
+            ngal_sq = np.outer(ngal, ngal)
+            ngal_sq = 2 * ngal_sq - np.diag(np.diag(ngal_sq))
+            ngal_sq = symmetric_matrix_to_array(ngal_sq)
+            xi = self.tpcf_matrix * ngal_sq / np.sum(ngal_sq)
         elif self.attrs['mode'] == 'cross':
             xi = self.tpcf_matrix * ngal / np.sum(ngal)
 
         if not separate_gal_type:
             ngal = np.sum(ngal)
-            if self.attrs['mode'] == 'auto':
-                xi = np.sum(xi, axis=(1, 2)).reshape(self.tpcf_shape)
-            elif self.attrs['mode'] == 'cross':
-                xi = np.sum(xi, axis=1).reshape(self.tpcf_shape)
+            xi = np.sum(xi, axis=1).reshape(self.tpcf_shape)
             return ngal, xi
         else:
             ngal_dict = {}
@@ -545,17 +536,17 @@ def predict(self, model, separate_gal_type=False, **occ_kwargs):
                 ngal_dict[gal_type] = np.sum(ngal[mask])
 
             if self.attrs['mode'] == 'auto':
-                grid = np.meshgrid(self.gal_type['gal_type'],
-                                   self.gal_type['gal_type'])
                 for gal_type_1, gal_type_2 in (
                         itertools.combinations_with_replacement(
                             np.unique(self.gal_type['gal_type']), 2)):
-                    mask = (((gal_type_1 == grid[0]) &
-                             (gal_type_2 == grid[1])) |
-                            ((gal_type_1 == grid[1]) &
-                             (gal_type_2 == grid[0])))
+                    mask = symmetric_matrix_to_array(np.outer(
+                        gal_type_1 == self.gal_type['gal_type'],
+                        gal_type_2 == self.gal_type['gal_type']) |
+                            np.outer(
+                        gal_type_2 == self.gal_type['gal_type'],
+                        gal_type_1 == self.gal_type['gal_type']))
                     xi_dict['%s-%s' % (gal_type_1, gal_type_2)] = np.sum(
-                        xi * mask, axis=(1, 2)).reshape(self.tpcf_shape)
+                        xi * mask, axis=1).reshape(self.tpcf_shape)
 
             elif self.attrs['mode'] == 'cross':
                 for gal_type in np.unique(self.gal_type['gal_type']):
@@ -566,166 +557,155 @@ def predict(self, model, separate_gal_type=False, **occ_kwargs):
             return ngal_dict, xi_dict
 
 
-def interpolate_predict(tabcorr_arr, x, xi, model, extrapolate=True,
-                        **occ_kwargs):
-    """
-    Linearly interprets the predictions from multiple TabCorr instances. Each
-    TabCorr instance will have a corresponding x-value and this function
-    finds a linear/barycentric interpolation for an intermediate point xi. For
-    example, this function can be used for predict correlation functions
-    for continues choices of concentrations for satellites.
-
-    Parameters
-    ----------
-    tabcorr_arr : array_like
-        TabCorr instances used to interpolate.
-
-    x : array_like
-        Array of shape (npts) or (npts, ndim). Here, npts is the number of
-        TabCorr instances and ndim the number of dimensions over which
-        we will interpolate.
-
-    xi : float or array_like
-        x-value at which to interpolate. If x has shape (npts), xi must have
-        be a float. On the other hand, if x has shape (npts, ndim), xi must
-        be an array of len ndim.
-
-    model : HodModelFactory
-        Instance of ``halotools.empirical_models.HodModelFactory``
-        describing the model for which predictions are made.
-
-    extrapolate : boolean, optional
-        Whether to allow extrapolation beyond points sampled by x. If set to
-        False, attempting to extrapolate will result in a RuntimeError.
-
-    **occ_kwargs : dict, optional
-            Keyword arguments passed to the ``mean_occupation`` functions of
-            the model.
-
-    Returns
-    -------
-    ngal : numpy.array or dict
-        Array containing the number densities for each galaxy type stored in
-        self.gal_type. The total galaxy number density is the sum of all
-        elements of this array.
-
-    xi : numpy.array or dict
-        Array storing the prediction for the correlation function.
-    """
+class TabCorrInterpolation:
 
-    try:
-        assert isinstance(x, list) or isinstance(x, np.ndarray)
-        x = np.array(x)
-        assert (x.ndim == 1) or (x.ndim == 2)
-    except AssertionError:
-        raise RuntimeError('x must be a one or two-dimensional array.')
+    def __init__(self, tabcorr_list, param_dict_table):
+        """
+        Initialize an interpolation of multiple TabCorr instances.
 
-    if x.ndim == 1:
-        n_points = x.shape[0]
-        n_dim = 1
-    elif x.ndim == 2:
-        n_points = x.shape[0]
-        n_dim = x.shape[1]
-    if n_points <= n_dim:
-        raise RuntimeError('x must contain more points than dimensions.')
+        Parameters
+        ----------
+        tabcorr_list : array_like
+            TabCorr instances used to interpolate.
 
-    try:
-        if n_dim == 1:
-            assert isinstance(xi, float)
+        param_dict_table : astropy.table.Table
+            Table containing the keywords and values corresponding to the
+            TabCorr list. Must have the same length and ordering as
+            tabcorr_list.
+        """
+
+        if len(tabcorr_list) != len(param_dict_table):
+            raise RuntimeError('The length of tabcorr_list does not match ' +
+                               'the number of entries in param_dict_table.')
+
+        self.keys = param_dict_table.colnames
+        self.n_dim = len(self.keys)
+        self.n_pts = len(param_dict_table)
+        self.tabcorr_list = tabcorr_list
+
+        if self.n_pts < self.n_dim + 1:
+            raise RuntimeError('The number of TabCorr instances provided ' +
+                               'must be at least the number of dimensions ' +
+                               'plus 1.')
+
+        if self.n_dim == 0:
+            raise RuntimeError('param_dict_table is empty.')
+        if self.n_dim == 1:
+            self.x = param_dict_table.columns[0].data
         else:
-            assert isinstance(xi, list) or isinstance(xi, np.ndarray)
-            xi = np.array(xi)
-            assert xi.ndim == 1
-            assert len(xi) == n_dim
-    except AssertionError:
-        raise RuntimeError('xi must match the dimensionality of x.')
+            self.x = np.empty((self.n_pts, self.n_dim))
+            for i, key in enumerate(param_dict_table.colnames):
+                self.x[:, i] = param_dict_table[key].data
+            self.delaunay = Delaunay(self.x)
 
-    if n_points != len(tabcorr_arr):
-        raise RuntimeError('The length of tabcorr_arr does not match the ' +
-                           'number of points provided via x.')
+    def predict(self, model, extrapolate=True, **occ_kwargs):
+        """
+        Linearly interpolate the predictions from multiple TabCorr instances.
+        For example, this function can be used for predict correlation
+        functions for continues choices of concentrations for satellites.
+        Parameters to linearly interpolate over should be in the parameter
+        dictionary of the model.
 
-    if n_dim > 1:
+        Parameters
+        ----------
+        model : HodModelFactory
+            Instance of ``halotools.empirical_models.HodModelFactory``
+            describing the model for which predictions are made.
 
-        tri = Delaunay(x)
-        i = tri.find_simplex(xi)
-        if i != -1:
-            s = tri.simplices[i]
-        if i == -1:
-            if not extrapolate:
-                raise RuntimeError('x is outside of the interpolation range.')
-            else:
-                x_cm = np.mean(x[tri.simplices], axis=1)
-                i = np.argmin(np.sum((xi - x_cm)**2, axis=1))  # closest simplex
+        separate_gal_type : boolean, optional
+            If True, the return values are dictionaries divided by each galaxy
+            types contribution to the output result.
 
-        s = tri.simplices[i]
-        b = tri.transform[i, :-1].dot(xi - tri.transform[i, -1])
-        w = np.append(b, 1 - np.sum(b))
+        extrapolate : boolean, optional
+            Whether to allow extrapolation beyond points sampled by x. If set
+            to False, attempting to extrapolate will result in a RuntimeError.
 
-    else:
+        **occ_kwargs : dict, optional
+                Keyword arguments passed to the ``mean_occupation`` functions
+                of the model.
 
-        if np.any(x < xi) and np.any(x > xi):
-            s = [np.ma.MaskedArray.argmax(np.ma.masked_array(x, mask=(x > xi))),
-                 np.ma.MaskedArray.argmin(np.ma.masked_array(x, mask=(x < xi)))]
-        else:
-            if not extrapolate:
-                raise RuntimeError('x is outside of the interpolation range.')
-            else:
-                s = np.argsort(np.abs(xi - x))[:2]
+        Returns
+        -------
+        ngal : numpy.array or dict
+            Array containing the number densities for each galaxy type stored
+            in self.gal_type. The total galaxy number density is the sum of all
+            elements of this array.
 
-        w = np.array([x[s[1]] - xi, xi - x[s[0]]]) / (x[s[1]] - x[s[0]])
+        xi : numpy.array or dict
+            Array storing the prediction for the correlation function.
+        """
 
-    for i in range(len(s)):
-        ngal_i, xi_i = tabcorr_arr[s[i]].predict(model, **occ_kwargs)
-        if i == 0:
-            ngal = ngal_i * w[i]
-            xi = xi_i * w[i]
-        else:
-            ngal += ngal_i * w[i]
-            xi += xi_i * w[i]
+        x_model = np.empty(self.n_dim)
+        for i in range(self.n_dim):
+            try:
+                x_model[i] = model.param_dict[self.keys[i]]
+            except KeyError:
+                raise RuntimeError('key {} not present '.format(self.keys[i]) +
+                                   'in the parameter dictionary of the model.')
 
-    return ngal, xi
+        if self.n_dim > 1:
 
+            i_simplex = self.delaunay.find_simplex(x_model)
 
-def symmetric_matrix_to_array(matrix):
+            if i_simplex == -1:
+                if not extrapolate:
+                    raise RuntimeError('The parameters of the model are ' +
+                                       'outside of the interpolation range ' +
+                                       'and extrapolation is turned off.')
+                else:
+                    x_cm = np.mean(self.x[self.delaunay.simplices], axis=1)
+                    i_simplex = np.argmin(np.sum((x_model - x_cm)**2, axis=1))
 
-    try:
-        assert matrix.shape[0] == matrix.shape[1]
-        assert np.all(matrix == matrix.T)
-    except AssertionError:
-        raise RuntimeError('The matrix you provided is not symmetric.')
+            simplex = self.delaunay.simplices[i_simplex]
+            b = self.delaunay.transform[i_simplex, :-1].dot(
+                x_model - self.delaunay.transform[i_simplex, -1])
+            w = np.append(b, 1 - np.sum(b))
 
-    array = np.zeros((np.prod(matrix.shape) + matrix.shape[0]) // 2,
-                     dtype=matrix.dtype)
+        else:
 
-    k = 0
-    for i in range(matrix.shape[0]):
-        for j in range(matrix.shape[1]):
-            if j > i:
-                continue
+            if np.any(x_model < self.x) and np.any(x_model > self.x):
+                simplex = [np.ma.MaskedArray.argmax(
+                    np.ma.masked_array(self.x, mask=(self.x > x_model))),
+                           np.ma.MaskedArray.argmin(
+                    np.ma.masked_array(self.x, mask=(self.x < x_model)))]
             else:
-                array[k] = matrix[i, j]
-                k = k + 1
-
-    return array
+                if not extrapolate:
+                    raise RuntimeError('The parameters of the model are ' +
+                                       'outside of the interpolation range ' +
+                                       'and extrapolation is turned off.')
+                else:
+                    simplex = np.argsort(np.abs(x_model - self.x))[:2]
+
+            w1 = (self.x[simplex[1]] - x_model) / (
+                self.x[simplex[1]] - self.x[simplex[0]])
+            w = [w1, 1 - w1]
+
+        for i, k in enumerate(simplex):
+            ngal_i, xi_i = self.tabcorr_list[k].predict(model, **occ_kwargs)
+            if i == 0:
+                ngal = ngal_i * w[i]
+                xi = xi_i * w[i]
+            else:
+                ngal += ngal_i * w[i]
+                xi += xi_i * w[i]
 
+        return ngal, xi
 
-def array_to_symmetric_matrix(array):
 
-    dim = int(np.rint(-0.5 + np.sqrt(0.25 + len(array) * 2)))
+def symmetric_matrix_to_array(matrix):
 
     try:
-        assert len(array) == (dim**2 + dim) / 2
+        assert matrix.shape[0] == matrix.shape[1]
+        assert np.all(matrix == matrix.T)
     except AssertionError:
-        raise RuntimeError('The length of the array does not correspond to a' +
-                           ' symmetric matrix.')
+        raise RuntimeError('The matrix you provided is not symmetric.')
 
-    matrix = np.zeros((dim, dim), dtype=array.dtype)
+    n_dim = matrix.shape[0]
+    sel = np.zeros((n_dim**2 + n_dim) // 2, dtype=np.int)
 
-    k = 0
     for i in range(matrix.shape[0]):
-        matrix[i, :i+1] = array[k:k+i+1]
-        k = k + i + 1
+        sel[(i*(i+1))//2:(i*(i+1))//2+(i+1)] = np.arange(
+            i*n_dim, i*n_dim + i + 1)
 
-    matrix = matrix + matrix.T - matrix * np.identity(matrix.shape[0])
+    return matrix.ravel()[sel]
 
-    return matrix