From 119dd6d617053d869d80d965a06f1cc22faa057a Mon Sep 17 00:00:00 2001 From: JosephGuman <105138387+JosephGuman@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:26:00 -0800 Subject: [PATCH] Adding cov() functionality with tests (#1129) * Adding cov() functionality with tests. Base function largely taken from numpy's cov() implementation Signed-off-by: Joseph Guman * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Update cunumeric/module.py Co-authored-by: Manolis Papadakis * Additional minor changes for style and functionality signed-off-by: Joseph Guman * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fixing unresolvable style issue signed-off-by: Joseph Guman * Use clip to avoid creating an intermediate * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Joseph Thomas Guman Co-authored-by: Manolis Papadakis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- cunumeric/array.py | 14 ++ cunumeric/module.py | 162 +++++++++++++++++++++++ docs/cunumeric/source/api/statistics.rst | 1 + tests/integration/test_stats.py | 96 ++++++++++++++ 4 files changed, 273 insertions(+) diff --git a/cunumeric/array.py b/cunumeric/array.py index 8bfc5178a..3b628ae4d 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2257,6 +2257,20 @@ def clip( Multiple GPUs, Multiple CPUs """ + min = ( + min + if min is not None + else np.iinfo(self.dtype).min + if self.dtype.kind == "i" + else -np.inf + ) + max = ( + max + if max is not None + else np.iinfo(self.dtype).max + if self.dtype.kind == "i" + else np.inf + ) args = ( np.array(min, dtype=self.dtype), np.array(max, dtype=self.dtype), diff --git a/cunumeric/module.py b/cunumeric/module.py index 4baf60bc8..4ee0ef07b 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -7545,6 +7545,168 @@ def var( ) +@add_boilerplate("m", "y", "fweights", "aweights") +def cov( + m: ndarray, + y: Optional[ndarray] = None, + rowvar: bool = True, + bias: bool = False, + ddof: Optional[int] = None, + fweights: Optional[ndarray] = None, + aweights: Optional[ndarray] = None, + *, + dtype: Optional[np.dtype[Any]] = None, +) -> ndarray: + """ + Estimate a covariance matrix, given data and weights. + + Covariance indicates the level to which two variables vary together. + If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`, + then the covariance matrix element :math:`C_{ij}` is the covariance of + :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance + of :math:`x_i`. + + Parameters + ---------- + m : array_like + A 1-D or 2-D array containing multiple variables and observations. + Each row of `m` represents a variable, and each column a single + observation of all those variables. Also see `rowvar` below. + y : array_like, optional + An additional set of variables and observations. `y` has the same form + as that of `m`. + rowvar : bool, optional + If `rowvar` is True (default), then each row represents a + variable, with observations in the columns. Otherwise, the relationship + is transposed: each column represents a variable, while the rows + contain observations. + bias : bool, optional + Default normalization (False) is by ``(N - 1)``, where ``N`` is the + number of observations given (unbiased estimate). If `bias` is True, + then normalization is by ``N``. These values can be overridden by using + the keyword ``ddof``. + ddof : int, optional + If not ``None`` the default value implied by `bias` is overridden. + Note that ``ddof=1`` will return the unbiased estimate, even if both + `fweights` and `aweights` are specified, and ``ddof=0`` will return + the simple average. The default value is ``None``. + fweights : array_like, int, optional + 1-D array of integer frequency weights; the number of times each + observation vector should be repeated. + aweights : array_like, optional + 1-D array of observation vector weights. These relative weights are + typically large for observations considered "important" and smaller for + observations considered less "important". If ``ddof=0`` the array of + weights can be used to assign probabilities to observation vectors. + dtype : data-type, optional + Data-type of the result. By default, the return data-type will have + at least `float64` precision. + + Returns + ------- + out : ndarray + The covariance matrix of the variables. + + See Also + -------- + numpy.cov + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + # Check inputs + if ddof is not None and type(ddof) is not int: + raise ValueError("ddof must be integer") + + # Handles complex arrays too + if m.ndim > 2: + raise ValueError("m has more than 2 dimensions") + + if y is not None and y.ndim > 2: + raise ValueError("y has more than 2 dimensions") + + if dtype is None: + if y is None: + dtype = np.result_type(m.dtype, np.float64) + else: + dtype = np.result_type(m.dtype, y.dtype, np.float64) + + X = array(m, ndmin=2, dtype=dtype) + if not rowvar and X.shape[0] != 1: + X = X.T + if X.shape[0] == 0: + return empty((0, 0)) + if y is not None: + y = array(y, copy=False, ndmin=2, dtype=dtype) + if not rowvar and y.shape[0] != 1: + y = y.T + # TODO(mpapadakis): Could have saved on an intermediate copy of X in + # this case, if it was already of the right shape. + X = concatenate((X, y), axis=0) + + if ddof is None: + if not bias: + ddof = 1 + else: + ddof = 0 + + # Get the product of frequencies and weights + w: Union[ndarray, None] = None + if fweights is not None: + if fweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional fweights") + if fweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and fweights") + if any(fweights < 0): + raise ValueError("fweights cannot be negative") + w = fweights + if aweights is not None: + if aweights.ndim > 1: + raise RuntimeError("cannot handle multidimensional aweights") + if aweights.shape[0] != X.shape[1]: + raise RuntimeError("incompatible numbers of samples and aweights") + if any(aweights < 0): + raise ValueError("aweights cannot be negative") + if w is None: + w = aweights + else: + # Cannot be done in-place with *= when aweights.dtype != w.dtype + w = w * aweights + + # TODO upon merge of average() replace this code + w_sum: Union[ndarray, int] = 0 + if w is None: + avg = mean(X, axis=1) + w_sum = X.shape[1] + else: + w_sum = sum(w) + avg = sum(X * w, axis=1) / w_sum + + # Determine the normalization + fact: Union[ndarray, float] = 0.0 + if w is None: + fact = X.shape[1] - ddof + elif ddof == 0: + fact = w_sum + elif aweights is None: + fact = w_sum - ddof + else: + fact = w_sum - ddof * sum(w * aweights) / w_sum + + fact = clip(fact, 0.0, None) + + X -= avg[:, None] + if w is None: + X_T = X.T + else: + X_T = (X * w).T + c = dot(X, X_T.conj()) + # Cannot be done in-place with /= when the dtypes differ + c = c / fact + return c.squeeze() + + # Histograms diff --git a/docs/cunumeric/source/api/statistics.rst b/docs/cunumeric/source/api/statistics.rst index c0caf8493..e0396227c 100644 --- a/docs/cunumeric/source/api/statistics.rst +++ b/docs/cunumeric/source/api/statistics.rst @@ -12,6 +12,7 @@ Averages and variances mean nanmean var + cov Histograms diff --git a/tests/integration/test_stats.py b/tests/integration/test_stats.py index 256a6b77f..c598a7899 100644 --- a/tests/integration/test_stats.py +++ b/tests/integration/test_stats.py @@ -215,6 +215,102 @@ def test_var_xfail(dtype, ddof, axis, shape): check_op(op_np, op_num, np_in, dtype, negative_test=True) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("rowvar", [True, False]) +@pytest.mark.parametrize("ddof", [None, 0, 1]) +def test_cov(dtype, rowvar, ddof): + np_in = get_op_input(astype=dtype) + num_in = num.array(np_in) + + np_out = np.cov(np_in, rowvar=rowvar, ddof=ddof) + num_out = num.cov(num_in, rowvar=rowvar, ddof=ddof) + if dtype == dtypes[0]: + assert allclose(np_out, num_out, atol=1e-2) + else: + assert allclose(np_out, num_out) + + +fweights_base = [[9, 2, 1, 2, 3], [1, 1, 3, 2, 4], None] +np_aweights_base = [ + np.abs(get_op_input(astype=dtype, shape=(5,))) for dtype in dtypes +] + [[0.03, 0.04, 01.01, 0.02, 0.08], None] + + +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("bias", [True, False]) +@pytest.mark.parametrize("ddof", [None, 0, 1]) +@pytest.mark.parametrize("fweights", fweights_base) +@pytest.mark.parametrize("np_aweights", np_aweights_base) +def test_cov_full(dtype, bias, ddof, fweights, np_aweights): + np_in = get_op_input(astype=dtype, shape=(4, 5)) + num_in = num.array(np_in) + if fweights is not None: + np_fweights = np.array(fweights) + num_fweights = num.array(fweights) + else: + np_fweights = None + num_fweights = None + if isinstance(np_aweights, np.ndarray): + num_aweights = num.array(np_aweights) + else: + num_aweights = np_aweights + # num_aweights = None + # np_aweights = None + + np_out = np.cov( + np_in, bias=bias, ddof=ddof, fweights=np_fweights, aweights=np_aweights + ) + num_out = num.cov( + num_in, + bias=bias, + ddof=ddof, + fweights=num_fweights, + aweights=num_aweights, + ) + # if dtype == dtypes[0]: + # assert allclose(np_out, num_out, atol=1e-2) + # else: + # assert allclose(np_out, num_out) + assert allclose(np_out, num_out, atol=1e-2) + + +@pytest.mark.parametrize("ddof", [None, 0, 1]) +@pytest.mark.parametrize("fweights", fweights_base) +@pytest.mark.parametrize("np_aweights", np_aweights_base) +def test_cov_dtype_scaling(ddof, fweights, np_aweights): + np_in = np.array( + [ + [1 + 3j, 1 - 1j, 2 + 2j, 4 + 3j, -1 + 2j], + [1 + 3j, 1 - 1j, 2 + 2j, 4 + 3j, -1 + 2j], + ] + ) + num_in = num.array(np_in) + if fweights is not None: + np_fweights = np.array(fweights) + num_fweights = num.array(fweights) + else: + np_fweights = None + num_fweights = None + if isinstance(np_aweights, np.ndarray): + num_aweights = num.array(np_aweights) + else: + num_aweights = np_aweights + + np_out = np.cov( + np_in, + ddof=ddof, + fweights=np_fweights, + aweights=np_aweights, + ) + num_out = num.cov( + num_in, + ddof=ddof, + fweights=num_fweights, + aweights=num_aweights, + ) + assert allclose(np_out, num_out, atol=1e-2) + + if __name__ == "__main__": import sys