control all randomness

cyianor · May 21, 2024 · 4260e11 · 4260e11
1 parent cbded1d
commit 4260e11
Show file tree

Hide file tree

Showing 2 changed files with 151 additions and 92 deletions.
diff --git a/README.ipynb b/README.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -55,7 +55,7 @@
     "xs_sim = solrcmf.simulate(\n",
     "    viewdims={0: 100, 1: 50, 2: 50},\n",
     "    factor_scales={\n",
-    "        (0, 1): [7.0, 4.5, 3.9, 0.0, 0.0],\n",
+    "        (0, 1): [7.0, 5.1, 4.6, 0.0, 0.0],\n",
     "        (0, 2): [8.3, 0.0, 0.0, 5.5, 0.0],\n",
     "        (1, 2, 0): [6.3, 0.0, 4.7, 0.0, 5.1],\n",
     "        (1, 2, 1): [0.0, 8.6, 4.9, 0.0, 0.0],\n",
@@ -81,7 +81,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -107,7 +107,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 128,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -149,11 +149,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 129,
    "metadata": {},
    "outputs": [],
    "source": [
-    "est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_)"
+    "est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_);"
    ]
   },
   {
@@ -167,9 +167,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 130,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0, 1)    : [-0.00  2.34 -0.00 -0.00 -4.17  0.00  0.00  0.00  0.00  6.52]\n",
+      "(0, 2)    : [ 0.00 -0.00  3.44  0.00  0.00 -0.00 -0.00  0.00 -0.00 -7.53]\n",
+      "(1, 2, 0) : [ 0.00 -3.21 -0.00  0.00  0.00  0.00  0.00 -0.00 -2.88 -5.67]\n",
+      "(1, 2, 1) : [ 0.00 -3.74  0.05 -0.00 -8.22 -0.00  0.00 -0.00  0.00  0.00]\n"
+     ]
+    }
+   ],
    "source": [
     "for k, d in est.ds_.items():\n",
     "    rescaled_d = d * np.sqrt((xs_centered[k] ** 2).sum())\n",
@@ -199,20 +210,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Lists of structure and factor penalties are supplied containing the\n",
     "# parameter combinations to be tested. Lists need to be of the same length\n",
     "# or one needs to be a scalar.\n",
-    "# - `cv` number of folds\n",
+    "# - `cv` number of folds as an integer or an object of\n",
+    "#   class `solrcmf.ElementwiseFolds`. The latter is also used internally if\n",
+    "#   only an integer is provided; however, passing it explicitly allows\n",
+    "#   specification of a random number generator and whether or not inputs\n",
+    "#   should be shuffled before splitting.\n",
     "est_cv = solrcmf.SolrCMFCV(\n",
     "    max_rank=10,\n",
     "    structure_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)),\n",
     "    factor_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)),\n",
     "    mu=10,\n",
-    "    cv=10,\n",
+    "    cv=solrcmf.ElementwiseFolds(10, rng=rng),\n",
     "    init=\"custom\",\n",
     "    max_iter=100000,\n",
     "    n_jobs=4,\n",
@@ -228,14 +243,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Initial values are supplied as lists. If length 1 then they are reused.\n",
     "# If same length as hyperparameters then different initial values can be used\n",
     "# for each pair of hyperparameters.\n",
-    "est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_])"
+    "est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_]);"
    ]
   },
   {
@@ -247,9 +262,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 133,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "structure_penalty                         0.114250\n",
+       "max_rank                                 10.000000\n",
+       "factor_penalty                            0.058822\n",
+       "objective_value_penalized                 2.014111\n",
+       "mean_elapsed_process_time_penalized       7.391883\n",
+       "std_elapsed_process_time_penalized        0.000000\n",
+       "est_max_rank                              5.000000\n",
+       "structural_zeros                         30.000000\n",
+       "factor_zeros                           1748.000000\n",
+       "neg_mean_squared_error_fold0             -0.000191\n",
+       "neg_mean_squared_error_fold1             -0.000189\n",
+       "neg_mean_squared_error_fold2             -0.000196\n",
+       "neg_mean_squared_error_fold3             -0.000169\n",
+       "neg_mean_squared_error_fold4             -0.000199\n",
+       "neg_mean_squared_error_fold5             -0.000188\n",
+       "neg_mean_squared_error_fold6             -0.000184\n",
+       "neg_mean_squared_error_fold7             -0.000185\n",
+       "neg_mean_squared_error_fold8             -0.000191\n",
+       "neg_mean_squared_error_fold9             -0.000181\n",
+       "mean_elapsed_process_time_fixed           1.239766\n",
+       "std_elapsed_process_time_fixed            0.132469\n",
+       "mean_neg_mean_squared_error              -0.000187\n",
+       "std_neg_mean_squared_error                0.000008\n",
+       "Name: 76, dtype: float64"
+      ]
+     },
+     "execution_count": 133,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "\n",
@@ -259,9 +308,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 134,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0, 1)    : [ 3.92 -0.00 -5.24  0.00  7.43]\n",
+      "(0, 2)    : [-0.00  5.06  0.00 -0.00 -8.45]\n",
+      "(1, 2, 0) : [-4.26 -0.00  0.00 -4.17 -6.58]\n",
+      "(1, 2, 1) : [-4.84  0.00 -9.23  0.00  0.00]\n"
+     ]
+    }
+   ],
    "source": [
     "for k, d in est_cv.best_estimator_.ds_.items():\n",
     "    rescaled_d = d * np.sqrt((xs_centered[k] ** 2).sum())\n",
@@ -277,16 +337,27 @@
    "source": [
     "Due to the small size of the data sources and signal-to-noise ratio of 0.5, it is not possible to recover singular values perfectly. However, thanks to unpenalized re-estimation, the strong shrinkage seen in the manual solution above is not present here.\n",
     "\n",
-    "The factor estimates are in `est_cv.best_estimator_.vs_`, however, sparse factors can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 0 of view 0 in the groundtruth corresponds to factor 0 in view 0 of the estimate. Note that in general factor order is arbitrary."
+    "The factor estimates are in `est_cv.best_estimator_.vs_`; sparse factors, however, can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 1 of view 0 in the ground truth (column index 0) corresponds, up to sign, to factor 5 in view 0 of the estimate (column index 4). Note that in general factor order is arbitrary."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 138,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "-0.9878174758052286"
+      ]
+     },
+     "execution_count": 138,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "np.sum(xs_sim[\"vs\"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 0])"
+    "np.sum(xs_sim[\"vs\"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 4])"
    ]
   },
   {
@@ -298,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 136,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -316,16 +387,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 139,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0.6410256410256411, 0.0)"
+      ]
+     },
+     "execution_count": 139,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "(\n",
     "    true_positive_rate(\n",
-    "        xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0]\n",
+    "        xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4]\n",
     "    ),\n",
     "    false_positive_rate(\n",
-    "        xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0]\n",
+    "        xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4]\n",
     "    ),\n",
     ")"
    ]