Updating regression requirements (#81)
* Fitting the regression line for the cerebellum group and the SST cell type was using only 1 datapoint and producing erroneous results. Now the requirement for fitting the regression coefficient is to have a minimum of 6 datapoints (see the sketch below).
* Added some debug statements for logging, which help in checking how well the regression ran.
* Changed how the compute_region_volumes function in atlas_densities.densities.utils calculates the "id_volume" column. Earlier it was generating NaN values, which led to errors downstream in the pipeline.

* make `min_data_points` a command-line parameter for fit-average-densities
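To illustrate the first bullet: with a one-parameter model of the form y = a * x (which is what `_optimize_func` further down in this diff suggests), fitting a single data point pins the coefficient down exactly but leaves scipy unable to estimate any uncertainty for it. A standalone sketch with made-up numbers, not repository code:

import warnings

from scipy.optimize import curve_fit

with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # scipy emits an OptimizeWarning: covariance cannot be estimated
    popt, pcov = curve_fit(lambda x, a: a * x, [2.0], [4.0])
print(popt)  # [2.] -- the single point is fitted exactly ...
print(pcov)  # [[inf]] -- ... but no standard deviation can be estimated for the coefficient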

---------

Co-authored-by: Pranav Rai <[email protected]>
mgeplf and rai-pranav authored Jun 20, 2024
1 parent e925871 commit a99649a
Showing 6 changed files with 106 additions and 16 deletions.
30 changes: 30 additions & 0 deletions CHANGELOG.rst
@@ -1,11 +1,41 @@
Changelog
=========

Version 0.2.6
-------------
* Updating regression requirements; teach
`atlas-densities cell-densities fit-average-densities` to accept `--min-data-points` (#81)

Version 0.2.5
-------------

* Fix nans in compute_region_volumes (#80)

Version 0.2.4
-------------

* Add r_square values to the results of the fitting. (#68)
* Add check to the region filter in case the input dataset is all equal to 0 for the region of interest. (#74)
* Fix fitting and optimization steps (#75)
* Faster _compute_region_cell_counts (#72)
* Cleanup app tests (#78)
* Faster compute average intensities (#73)
* Speedup mtype density creation from probability maps using voxcell's ValueToIndexVoxels (#79)

Version 0.2.3
-------------

* remove pkg_resources, use importlib (#62)
* Drop literature values from regions that are not in hierarchy or not in the annotation volume (#61)

Version 0.2.2
-------------

* add ``atlas-densities combination manipulate``

Version 0.2.1
-------------

* The following *cell-density* sub-commands can now optionally take a ``--group-ids-config-path``:
*cell-density*, *glia-cell-densities*, *inhibitory-and-excitatory-neuron-densities*, *fit-average-densities*

1 change: 1 addition & 0 deletions README.rst
@@ -286,6 +286,7 @@ respective standard deviation and `coefficient of determination`_ (`r_square`).
--fitted-densities-output-path=data/ccfv2/first_estimates/first_estimates.csv \
--fitting-maps-output-path=data/ccfv2/first_estimates/fitting.json
Note: One can use ``--min-data-points`` to require a minimum number of data points for the linear regression; the default is 1.

Compute inhibitory/excitatory neuron densities
----------------------------------------------
9 changes: 9 additions & 0 deletions atlas_densities/app/cell_densities.py
@@ -808,6 +808,13 @@ def measurements_to_average_densities(
help="Path to density groups ids config",
show_default=True,
)
@click.option(
"--min-data-points",
type=int,
default=1,
help="minimum number of datapoints required for running the linear regression.",
show_default=True,
)
@log_args(L)
def fit_average_densities(
hierarchy_path,
@@ -820,6 +827,7 @@
fitted_densities_output_path,
fitting_maps_output_path,
group_ids_config_path,
min_data_points,
): # pylint: disable=too-many-arguments, too-many-locals
"""
Estimate average cell densities of brain regions in `hierarchy_path` for the cell types
@@ -935,6 +943,7 @@
cell_density_stddev,
region_name=region_name,
group_ids_config=group_ids_config,
min_data_points=min_data_points,
)

# Turn index into column to ease off the save and load operations on csv files
41 changes: 33 additions & 8 deletions atlas_densities/densities/fitting.py
@@ -367,7 +367,10 @@ def compute_average_intensities(


def linear_fitting_xy(
xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray]
xdata: list[float],
ydata: list[float],
sigma: Union[list[float], FloatArray],
min_data_points: int,
) -> dict[str, float]:
"""
Compute the coefficient of the linear least-squares regression of the point cloud
@@ -380,6 +383,10 @@
These are the standard deviations of the values in `ydata`.
Zero values are replaced by the least positive value if it exists,
otherwise by 1.0.
min_data_points: minimum number of datapoints required for running
the linear regression. If the number of datapoints is less than
min_data_points then no fitting is done, and np.nan values are
returned.
Returns:
a dict of the form {"coefficient": <float>,
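The zero-sigma replacement rule described above can be written in a couple of numpy lines; a sketch of the documented rule, not the repository implementation:

import numpy as np

sigma = np.array([0.5, 0.0, 2.0])
positive = sigma[sigma > 0.0]
# zero standard deviations are replaced by the least positive value, otherwise by 1.0
sigma[sigma == 0.0] = positive.min() if positive.size else 1.0
print(sigma)  # [0.5 0.5 2. ]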
@@ -389,9 +396,8 @@
Raises:
AtlasDensitiesError if some of the `sigma` values are negative.
"""

if len(xdata) == 0:
return {"coefficient": np.nan, "standard_deviation": np.nan}
if len(xdata) < min_data_points:
return {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan}

sigma = np.array(sigma)
if np.any(sigma < 0.0):
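A minimal sketch of how the new guard behaves, assuming only the signature and return values shown in this diff (the inputs are made up):

import numpy as np

from atlas_densities.densities.fitting import linear_fitting_xy

# Two points but six required: no fit is attempted and every returned value is NaN.
result = linear_fitting_xy([0.0, 1.0], [0.0, 2.0], [1.0, 1.0], min_data_points=6)
assert all(np.isnan(value) for value in result.values())

# With enough points the usual keys are filled in (same numbers as in the tests below).
result = linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=1)
assert np.isclose(result["coefficient"], 2.0) and np.isclose(result["r_square"], 1.0)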
@@ -422,6 +428,7 @@ def _optimize_func(x, coefficient):
)
ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2)
ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg
L.debug("Length of xdata = %s. The ss_reg is %s and ss_tot is %s", xdata, ss_reg, ss_tot)
# if total sum of square is null, no variance can be explained by the fitting
return {
"coefficient": parameters[0][0],
@@ -434,7 +441,10 @@ def _optimize_func(x, coefficient):


def compute_fitting_coefficients(
groups: dict[str, set[str]], average_intensities: pd.DataFrame, densities: pd.DataFrame
groups: dict[str, set[str]],
average_intensities: pd.DataFrame,
densities: pd.DataFrame,
min_data_points: int,
) -> FittingData:
"""
Compute the linear fitting coefficient of the cloud of 2D points (average marker intensity,
@@ -477,7 +487,7 @@
The "standard_deviation" value is the standard deviation of the coefficient value.
The "r_square" value is the coefficient of determination of the coefficient value.
"""

# pylint: disable=too-many-locals
if len(densities.index) != len(average_intensities.index) or np.any(
densities.index != average_intensities.index
):
@@ -545,8 +555,14 @@
L.info("Computing regression coefficients for %d cell types ...", len(cell_types))
for cell_type in tqdm(cell_types):
cloud = clouds[group_name][cell_type]
L.debug(
"The length of training data for %s and %s is %s",
group_name,
cell_type,
cloud["xdata"],
)
result[group_name][cell_type] = linear_fitting_xy(
cloud["xdata"], cloud["ydata"], cloud["sigma"]
cloud["xdata"], cloud["ydata"], cloud["sigma"], min_data_points
)
if np.isnan(result[group_name][cell_type]["coefficient"]):
warnings.warn(
@@ -730,6 +746,7 @@ def linear_fitting( # pylint: disable=too-many-arguments
cell_density_stddevs: Optional[dict[str, float]] = None,
region_name: str = "root",
group_ids_config: dict | None = None,
min_data_points: int = 5,
) -> pd.DataFrame:
"""
Estimate the average densities of every region in `region_map` using a linear fitting
@@ -772,6 +789,10 @@
standard deviations of average cell densities of the corresponding regions.
region_name: (str) name of the root region of interest
group_ids_config: mapping of regions to their constituent ids
min_data_points: minimum number of datapoints required for running
the linear regression. If the number of datapoints is less than
min_data_points then no fitting is done, and np.nan values are
returned.
Returns:
tuple (densities, fitting_coefficients)
@@ -782,6 +803,7 @@ def linear_fitting( # pylint: disable=too-many-arguments
fitting_coefficients: dict returned by
:func:`atlas_densities.densities.fitting.compute_fitting_coefficients`.
"""
# pylint: disable=too-many-locals
assert group_ids_config is not None
L.info("Checking input data frames sanity ...")
_check_average_densities_sanity(average_densities)
@@ -833,7 +855,10 @@ def linear_fitting( # pylint: disable=too-many-arguments

L.info("Computing fitting coefficients ...")
fitting_coefficients = compute_fitting_coefficients(
groups, average_intensities, densities.drop(densities.index[indexes])
groups,
average_intensities,
densities.drop(densities.index[indexes]),
min_data_points=min_data_points,
)
L.info("Fitting unknown average densities ...")
fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients)
1 change: 0 additions & 1 deletion atlas_densities/densities/utils.py
@@ -538,5 +538,4 @@ def compute_region_volumes(
volume = result.loc[list(id_set), "id_volume"].sum()
volumes.append(volume)
result["volume"] = volumes

return result
40 changes: 33 additions & 7 deletions tests/densities/test_fitting.py
@@ -273,17 +273,24 @@ def test_compute_average_intensities(region_map, hierarchy_info):


def test_linear_fitting_xy():
actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0])
min_data_points = 1
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert np.allclose(actual["r_square"], 1.0)
assert not np.isinf(actual["standard_deviation"])

actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5])
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert not np.isinf(actual["standard_deviation"])
assert np.allclose(actual["r_square"], 0.89286)

actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0])
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert not np.isinf(actual["standard_deviation"])
assert np.allclose(actual["r_square"], 1.0)
@@ -319,7 +326,10 @@ def test_compute_fitting_coefficients(hierarchy_info):
data = get_fitting_input_data_(hierarchy_info)

actual = tested.compute_fitting_coefficients(
data["groups"], data["intensities"], data["densities"]
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

for group_name in ["Whole", "Central lobule"]:
Expand All @@ -340,18 +350,33 @@ def test_compute_fitting_coefficients_exceptions(hierarchy_info):
data["densities"].drop(index=["Central lobule"], inplace=True)

with pytest.raises(AtlasDensitiesError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

data = get_fitting_input_data_(hierarchy_info)
data["densities"].drop(columns=["pv+"], inplace=True)

with pytest.raises(AtlasDensitiesError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

data = get_fitting_input_data_(hierarchy_info)
data["densities"].at["Lobule II", "pv+_standard_deviation"] = np.nan
with pytest.raises(AssertionError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)


@pytest.fixture
@@ -525,6 +550,7 @@ def test_linear_fitting(group_ids_config):
data["average_densities"],
data["homogenous_regions"],
group_ids_config=group_ids_config,
min_data_points=1,
)
warnings_ = [w for w in warnings_ if isinstance(w.message, AtlasDensitiesWarning)]
# Three warnings for recording NaN coefficients, three warnings for using them