From 4260e111a0d0bb0e85a333e65b503ea1a9e7d9fa Mon Sep 17 00:00:00 2001 From: Felix Held Date: Tue, 21 May 2024 18:53:47 +0200 Subject: [PATCH] control all randomness --- README.ipynb | 134 +++++++++++++++++++++++++++++++++++++++++---------- README.md | 109 +++++++++++++++++------------------------ 2 files changed, 151 insertions(+), 92 deletions(-) diff --git a/README.ipynb b/README.ipynb index 81547c0..c396ec6 100644 --- a/README.ipynb +++ b/README.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ "xs_sim = solrcmf.simulate(\n", " viewdims={0: 100, 1: 50, 2: 50},\n", " factor_scales={\n", - " (0, 1): [7.0, 4.5, 3.9, 0.0, 0.0],\n", + " (0, 1): [7.0, 5.1, 4.6, 0.0, 0.0],\n", " (0, 2): [8.3, 0.0, 0.0, 5.5, 0.0],\n", " (1, 2, 0): [6.3, 0.0, 4.7, 0.0, 5.1],\n", " (1, 2, 1): [0.0, 8.6, 4.9, 0.0, 0.0],\n", @@ -81,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -149,11 +149,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ - "est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_)" + "est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_);" ] }, { @@ -167,9 +167,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 130, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 1) : [-0.00 2.34 -0.00 -0.00 -4.17 0.00 0.00 0.00 0.00 6.52]\n", + "(0, 2) : [ 0.00 -0.00 3.44 0.00 0.00 -0.00 -0.00 0.00 -0.00 -7.53]\n", + "(1, 2, 0) : [ 0.00 -3.21 -0.00 0.00 0.00 0.00 0.00 -0.00 -2.88 -5.67]\n", + "(1, 2, 1) : [ 0.00 -3.74 0.05 -0.00 -8.22 -0.00 0.00 
-0.00 0.00 0.00]\n" + ] + } + ], "source": [ "for k, d in est.ds_.items():\n", " rescaled_d = d * np.sqrt((xs_centered[k] ** 2).sum())\n", @@ -199,20 +210,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ "# Lists of structure and factor penalties are supplied containing the\n", "# parameter combinations to be tested. Lists need to be of the same length\n", "# or one needs to be a scalar.\n", - "# - `cv` number of folds\n", + "# - `cv` number of folds as an integer or an object of\n", + "# class `solrcmf.ElementwiseFolds`. The latter is also used internally if only\n", + "# an integer is provided, however, it allows specification of a random\n", + "# number generator and whether or not inputs should be shuffled\n", + "# before splitting.\n", "est_cv = solrcmf.SolrCMFCV(\n", " max_rank=10,\n", " structure_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)),\n", " factor_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)),\n", " mu=10,\n", - " cv=10,\n", + " cv=solrcmf.ElementwiseFolds(10, rng=rng),\n", " init=\"custom\",\n", " max_iter=100000,\n", " n_jobs=4,\n", @@ -228,14 +243,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ "# Initial values are supplied as lists. 
If length 1 then they are reused.\n", "# If same length as hyperparameters then different initial values can be used\n", "# for each pair of hyperparameters.\n", - "est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_])" + "est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_]);" ] }, { @@ -247,9 +262,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "structure_penalty 0.114250\n", + "max_rank 10.000000\n", + "factor_penalty 0.058822\n", + "objective_value_penalized 2.014111\n", + "mean_elapsed_process_time_penalized 7.391883\n", + "std_elapsed_process_time_penalized 0.000000\n", + "est_max_rank 5.000000\n", + "structural_zeros 30.000000\n", + "factor_zeros 1748.000000\n", + "neg_mean_squared_error_fold0 -0.000191\n", + "neg_mean_squared_error_fold1 -0.000189\n", + "neg_mean_squared_error_fold2 -0.000196\n", + "neg_mean_squared_error_fold3 -0.000169\n", + "neg_mean_squared_error_fold4 -0.000199\n", + "neg_mean_squared_error_fold5 -0.000188\n", + "neg_mean_squared_error_fold6 -0.000184\n", + "neg_mean_squared_error_fold7 -0.000185\n", + "neg_mean_squared_error_fold8 -0.000191\n", + "neg_mean_squared_error_fold9 -0.000181\n", + "mean_elapsed_process_time_fixed 1.239766\n", + "std_elapsed_process_time_fixed 0.132469\n", + "mean_neg_mean_squared_error -0.000187\n", + "std_neg_mean_squared_error 0.000008\n", + "Name: 76, dtype: float64" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -259,9 +308,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 134, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(0, 1) : [ 3.92 -0.00 -5.24 0.00 7.43]\n", + "(0, 2) : [-0.00 5.06 0.00 -0.00 -8.45]\n", + "(1, 2, 0) : [-4.26 -0.00 0.00 
-4.17 -6.58]\n", + "(1, 2, 1) : [-4.84 0.00 -9.23 0.00 0.00]\n" + ] + } + ], "source": [ "for k, d in est_cv.best_estimator_.ds_.items():\n", " rescaled_d = d * np.sqrt((xs_centered[k] ** 2).sum())\n", @@ -277,16 +337,27 @@ "source": [ "Due to the small size of the data sources and signal-to-noise ratio of 0.5, it is not possible to recover singular values perfectly. However, thanks to unpenalized re-estimation, the strong shrinkage seen in the manual solution above is not present here.\n", "\n", - "The factor estimates are in `est_cv.best_estimator_.vs_`, however, sparse factors can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 0 of view 0 in the groundtruth corresponds to factor 0 in view 0 of the estimate. Note that in general factor order is arbitrary." + "The factor estimates are in `est_cv.best_estimator_.vs_`, however, sparse factors can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 0 of view 0 in the groundtruth corresponds to factor 4 in view 0 of the estimate. Note that in general factor order is arbitrary." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 138, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "-0.9878174758052286" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "np.sum(xs_sim[\"vs\"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 0])" + "np.sum(xs_sim[\"vs\"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 4])" ] }, { @@ -298,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -316,16 +387,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 139, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(0.6410256410256411, 0.0)" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "(\n", " true_positive_rate(\n", - " xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0]\n", + " xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4]\n", " ),\n", " false_positive_rate(\n", - " xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0]\n", + " xs_sim[\"vs\"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4]\n", " ),\n", ")" ] diff --git a/README.md b/README.md index 94322ce..4f7d0e5 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ rng = default_rng(42) xs_sim = solrcmf.simulate( viewdims={0: 100, 1: 50, 2: 50}, factor_scales={ - (0, 1): [7.0, 4.5, 3.9, 0.0, 0.0], + (0, 1): [7.0, 5.1, 4.6, 0.0, 0.0], (0, 2): [8.3, 0.0, 0.0, 5.5, 0.0], (1, 2, 0): [6.3, 0.0, 4.7, 0.0, 5.1], (1, 2, 1): [0.0, 8.6, 4.9, 0.0, 0.0], @@ -120,17 +120,9 @@ final values of the initial runs as starting values. 
Penalty parameters are not ```python -est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_) +est.fit(xs_scaled, vs=est_init.vs_, ds=est_init.ds_, us=est_init.vs_); ``` - - - - SolrCMF(factor_penalty=0.08, factor_pruning=False, init='custom', - max_iter=100000, max_rank=10, mu=10, structure_penalty=0.05) - - - Estimates for $D_{ij}$ are then in `est.ds_` and estimates for $V_i$ in `est.vs_`. Scale back to original scale. @@ -145,10 +137,10 @@ for k, d in est.ds_.items(): ) ``` - (0, 1) : [ 0.00 6.55 -1.44 3.52 0.00 -0.00 -0.00 -0.00 0.00 -0.00] - (0, 2) : [ 0.00 -7.52 -0.00 -0.00 -2.88 -0.00 0.00 -0.00 0.00 0.00] - (1, 2, 0) : [ 0.00 -5.67 2.92 0.00 -0.00 -0.00 -2.81 -0.00 -0.00 0.00] - (1, 2, 1) : [-0.00 0.00 3.64 -8.23 0.00 -0.00 0.00 0.00 0.00 0.00] + (0, 1) : [-0.00 2.34 -0.00 -0.00 -4.17 0.00 0.00 0.00 0.00 6.52] + (0, 2) : [ 0.00 -0.00 3.44 0.00 0.00 -0.00 -0.00 0.00 -0.00 -7.53] + (1, 2, 0) : [ 0.00 -3.21 -0.00 0.00 0.00 0.00 0.00 -0.00 -2.88 -5.67] + (1, 2, 1) : [ 0.00 -3.74 0.05 -0.00 -8.22 -0.00 0.00 -0.00 0.00 0.00] Shrinkage can be clearly seen in the singular value estimates compared to the groundtruth. @@ -169,13 +161,17 @@ The final solution is found by determining the pair of hyperparameters that lead # Lists of structure and factor penalties are supplied containing the # parameter combinations to be tested. Lists need to be of the same length # or one needs to be a scalar. -# - `cv` number of folds +# - `cv` number of folds as an integer or an object of +# class `solrcmf.ElementwiseFolds`. The latter is also used internally if only +# an integer is provided, however, it allows specification of a random +# number generator and whether or not inputs should be shuffled +# before splitting. 
est_cv = solrcmf.SolrCMFCV( max_rank=10, structure_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)), factor_penalty=np.exp(rng.uniform(np.log(5e-2), np.log(1.0), 100)), mu=10, - cv=10, + cv=solrcmf.ElementwiseFolds(10, rng=rng), init="custom", max_iter=100000, n_jobs=4, @@ -189,28 +185,9 @@ Perform hyperparameter selection. This step can be time-intensive. # Initial values are supplied as lists. If length 1 then they are reused. # If same length as hyperparameters then different initial values can be used # for each pair of hyperparameters. -est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_]) +est_cv.fit(xs_scaled, vs=[est_init.vs_], ds=[est_init.ds_], us=[est_init.vs_]); ``` - - - - SolrCMFCV(factor_penalty=array([0.36792423, 0.06941587, 0.10752516, 0.43376266, 0.53883355, - 0.95854522, 0.10676321, 0.62188201, 0.08373001, 0.07400256, - 0.44430964, 0.19013555, 0.53937187, 0.11657228, 0.09469818, - 0.25880558, 0.07063052, 0.11524739, 0.08227149, 0.13042509, - 0.49534386, 0.07178925, 0.3126294 , 0.28824926, 0.25066523, - 0.2132999 , 0.36531926, 0.64150673, 0.08124273, 0... - 0.71015324, 0.1375872 , 0.10718306, 0.73360059, 0.0774336 , - 0.05972806, 0.12817684, 0.48768923, 0.40007808, 0.96196336, - 0.14680267, 0.11424985, 0.15524923, 0.52084544, 0.09501248, - 0.85510326, 0.23217319, 0.52223399, 0.59602222, 0.2098567 , - 0.46080418, 0.14908991, 0.56755986, 0.59005505, 0.27265958, - 0.09611405, 0.91465952, 0.85313787, 0.32016594, 0.95285913, - 0.22548781, 0.15398784, 0.19865442, 0.05737153, 0.25905621])) - - - CV results can be found in the attribute `est_cv.cv_results_` and can be easily converted to a Pandas `DataFrame`. The best result corresponds to the row with index `est_cv.best_index_`. 
@@ -224,30 +201,30 @@ cv_res.loc[est_cv.best_index_, :] - structure_penalty 0.062040 + structure_penalty 0.114250 max_rank 10.000000 - factor_penalty 0.069416 - objective_value_penalized 1.934370 - mean_elapsed_process_time_penalized 6.977537 + factor_penalty 0.058822 + objective_value_penalized 2.014111 + mean_elapsed_process_time_penalized 7.679406 std_elapsed_process_time_penalized 0.000000 est_max_rank 5.000000 structural_zeros 30.000000 - factor_zeros 1746.000000 - neg_mean_squared_error_fold0 -0.000193 - neg_mean_squared_error_fold1 -0.000179 - neg_mean_squared_error_fold2 -0.000181 - neg_mean_squared_error_fold3 -0.000185 - neg_mean_squared_error_fold4 -0.000189 - neg_mean_squared_error_fold5 -0.000184 - neg_mean_squared_error_fold6 -0.000190 - neg_mean_squared_error_fold7 -0.000184 - neg_mean_squared_error_fold8 -0.000182 - neg_mean_squared_error_fold9 -0.000189 - mean_elapsed_process_time_fixed 1.265778 - std_elapsed_process_time_fixed 0.094385 - mean_neg_mean_squared_error -0.000186 - std_neg_mean_squared_error 0.000004 - Name: 1, dtype: float64 + factor_zeros 1748.000000 + neg_mean_squared_error_fold0 -0.000191 + neg_mean_squared_error_fold1 -0.000189 + neg_mean_squared_error_fold2 -0.000196 + neg_mean_squared_error_fold3 -0.000169 + neg_mean_squared_error_fold4 -0.000199 + neg_mean_squared_error_fold5 -0.000188 + neg_mean_squared_error_fold6 -0.000184 + neg_mean_squared_error_fold7 -0.000185 + neg_mean_squared_error_fold8 -0.000191 + neg_mean_squared_error_fold9 -0.000181 + mean_elapsed_process_time_fixed 1.367407 + std_elapsed_process_time_fixed 0.169312 + mean_neg_mean_squared_error -0.000187 + std_neg_mean_squared_error 0.000008 + Name: 76, dtype: float64 @@ -261,25 +238,25 @@ for k, d in est_cv.best_estimator_.ds_.items(): ) ``` - (0, 1) : [ 7.40 -3.27 4.59 0.00 -0.00] - (0, 2) : [-8.43 -0.00 -0.00 -5.15 0.00] - (1, 2, 0) : [-6.60 4.34 0.00 -0.00 -4.49] - (1, 2, 1) : [ 0.00 4.82 -9.21 0.00 0.00] + (0, 1) : [ 3.92 -0.00 -5.24 0.00 7.43] + (0, 2) : 
[-0.00 5.06 0.00 -0.00 -8.45] + (1, 2, 0) : [-4.26 -0.00 0.00 -4.17 -6.58] + (1, 2, 1) : [-4.84 0.00 -9.23 0.00 0.00] Due to the small size of the data sources and signal-to-noise ratio of 0.5, it is not possible to recover singular values perfectly. However, thanks to unpenalized re-estimation, the strong shrinkage seen in the manual solution above is not present here. -The factor estimates are in `est_cv.best_estimator_.vs_`, however, sparse factors can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 0 of view 0 in the groundtruth corresponds to factor 0 in view 0 of the estimate. Note that in general factor order is arbitrary. +The factor estimates are in `est_cv.best_estimator_.vs_`, however, sparse factors can be found in `est_cv.best_estimator_.us_`. In this particular run, factor 0 of view 0 in the groundtruth corresponds to factor 4 in view 0 of the estimate. Note that in general factor order is arbitrary. ```python -np.sum(xs_sim["vs"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 0]) +np.sum(xs_sim["vs"][0][:, 0] * est_cv.best_estimator_.us_[0][:, 4]) ``` - 0.9886157937798741 + -0.9878174758052286 @@ -303,10 +280,10 @@ def false_positive_rate(estimate, truth): ```python ( true_positive_rate( - xs_sim["vs"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0] + xs_sim["vs"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4] ), false_positive_rate( - xs_sim["vs"][0][:, 0], est_cv.best_estimator_.us_[0][:, 0] + xs_sim["vs"][0][:, 0], est_cv.best_estimator_.us_[0][:, 4] ), ) ``` @@ -314,6 +291,6 @@ def false_positive_rate(estimate, truth): - (0.6578947368421053, 0.0) + (0.6410256410256411, 0.0)