Add r_square values to the results of the fitting. (#68)

BlueBrain · Feb 21, 2024 · 6e76e8c · 6e76e8c
1 parent 376c0a8
commit 6e76e8c
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 15 deletions.
diff --git a/README.rst b/README.rst
@@ -265,8 +265,11 @@ details). This step leverages AIBS ISH marker datasets (in their expression form
 literature density values.
 These transfer functions are used to obtain first estimates of neuron densities in regions not
 covered by literature.
-The result of the following command is a list of first density estimates for each neuron type and
-for each region of the annotation volume.
+
+The results of the following command includes a csv file (`first_estimates.csv`) storing the list
+of first density estimates for each neuron type and for each region of the annotation volume.
+The `fitting.json` output file contains the coefficients fitted by the algorithm together with their
+respective standard deviation and `coefficient of determination`_ (`r_square`).
 
 .. code-block:: bash
 
@@ -449,6 +452,7 @@ Copyright (c) 2022-2024 Blue Brain Project/EPFL
 .. _`mmc3.xlsx`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/measurements/mmc3.xlsx
 .. _`gaba_papers.xlsx`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/measurements/gaba_papers.xlsx
 .. _`mtypes_probability_map_config.yaml`: https://github.com/BlueBrain/atlas-densities/blob/main/atlas_densities/app/data/mtypes/mtypes_probability_map_config.yaml
+.. _`coefficient of determination`: https://en.wikipedia.org/wiki/Coefficient_of_determination
 
 .. substitutions
 

diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py
@@ -285,7 +285,9 @@ def linear_fitting_xy(
             otherwise by 1.0.
 
     Returns:
-        a dict of the form {"coefficient": <float>, "standard_deviation": <float>}.
+        a dict of the form {"coefficient": <float>,
+                            "standard_deviation": <float>,
+                            "r_square": <float>}.
 
     Raises:
        AtlasDensitiesError if some of the `sigma`values are negative.
@@ -311,15 +313,24 @@ def linear_fitting_xy(
         least_positive = np.min(sigma[~zero_mask])
         sigma[zero_mask] = least_positive / 2.0
 
+    def _optimize_func(x, coefficient):
+        return coefficient * np.array(x)
+
     parameters = curve_fit(
-        lambda x, coefficient: coefficient * x,
+        _optimize_func,
         xdata=xdata,
         ydata=ydata,
         sigma=sigma,
         absolute_sigma=True,
     )
-
-    return {"coefficient": parameters[0][0], "standard_deviation": np.sqrt(parameters[1][0][0])}
+    ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2)
+    ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg
+    # if total sum of square is null, no variance can be explained by the fitting
+    return {
+        "coefficient": parameters[0][0],
+        "standard_deviation": np.sqrt(parameters[1][0][0]),
+        "r_square": (ss_reg / ss_tot) if ss_tot > 0 else np.nan,
+    }
 
 
 FittingData = Dict[str, Dict[str, Dict[str, float]]]
@@ -350,23 +361,24 @@ def compute_fitting_coefficients(
         dict of the form (values are fake):
         {
             "Isocortex" : {
-                "gad67": { "coefficient": 0.85, "standard_deviation": 0.15},
-                "pv": { "coefficient": 1.00, "standard_deviation": 0.5},
-                "sst": { "coefficient": 1.07, "standard_deviation": 0.75},
-                "vip": { "coefficient": 0.95, "standard_deviation": 0.15},
+                "gad67": { "coefficient": 0.85, "standard_deviation": 0.15, "r_square": 0.75},
+                "pv": { "coefficient": 1.00, "standard_deviation": 0.5, "r_square": 0.66},
+                "sst": { "coefficient": 1.07, "standard_deviation": 0.75, "r_square": 0.80},
+                "vip": { "coefficient": 0.95, "standard_deviation": 0.15, "r_square": 0.90},
             },
             "Cerebellum" : {
-                "gad67": { "coefficient": 0.75, "standard_deviation": 0.15},
-                "pv": { "coefficient": 1.10, "standard_deviation": 0.55},
-                "sst": { "coefficient": 1.20, "standard_deviation": 0.45},
-                "vip": { "coefficient": 0.95, "standard_deviation": 0.15},
+                "gad67": { "coefficient": 0.75, "standard_deviation": 0.15, "r_square": 0.55},
+                "pv": { "coefficient": 1.10, "standard_deviation": 0.55, "r_square": 0.40},
+                "sst": { "coefficient": 1.20, "standard_deviation": 0.45, "r_square": 0.58},
+                "vip": { "coefficient": 0.95, "standard_deviation": 0.15, "r_square": 0.77},
             },
            ...
         }
         The keys of the dict are the names of the region groups where separate fittings are
         performed. A "coefficient" value is the value of the linear regression coefficient for a
         given region group and a given cell type.
         The "standard_deviation" value is the standard deviation of the coefficient value.
+        The "r_square" value is the coefficient of determination of the coefficient value.
     """
 
     if len(densities.index) != len(average_intensities.index) or np.any(
@@ -387,7 +399,7 @@ def compute_fitting_coefficients(
 
     result = {
         group_name: {
-            cell_type: {"coefficient": np.nan, "standard_deviation": np.nan}
+            cell_type: {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan}
             for cell_type in cell_types
         }
         for group_name in groups

diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py
@@ -268,15 +268,18 @@ def test_compute_average_intensities(hierarchy_info):
 def test_linear_fitting_xy():
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0])
     assert np.allclose(actual["coefficient"], 2.0)
+    assert np.allclose(actual["r_square"], 1.0)
     assert not np.isinf(actual["standard_deviation"])
 
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5])
     assert np.allclose(actual["coefficient"], 2.0)
     assert not np.isinf(actual["standard_deviation"])
+    assert np.allclose(actual["r_square"], 0.89286)
 
     actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0])
     assert np.allclose(actual["coefficient"], 2.0)
     assert not np.isinf(actual["standard_deviation"])
+    assert np.allclose(actual["r_square"], 1.0)
 
 
 def get_fitting_input_data_(hierarchy_info):
@@ -319,6 +322,10 @@ def test_compute_fitting_coefficients(hierarchy_info):
         assert not np.isnan(actual[group_name]["gad67+"]["standard_deviation"])
         assert not np.isinf(actual[group_name]["pv+"]["standard_deviation"])
         assert not np.isnan(actual[group_name]["pv+"]["standard_deviation"])
+    assert actual["Whole"]["pv+"]["r_square"] == 1.0
+    assert actual["Whole"]["gad67+"]["r_square"] == 1.0
+    assert np.isnan(actual["Central lobule"]["pv+"]["r_square"])  # only one value so no variation.
+    assert np.isnan(actual["Central lobule"]["gad67+"]["r_square"])
 
 
 def test_compute_fitting_coefficients_exceptions(hierarchy_info):