Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updating regression requirements #81

Merged
merged 6 commits into from
Jun 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,11 +1,41 @@
Changelog
=========

Version 0.2.6
-------------
* Update regression requirements; teach
  ``atlas-densities cell-densities fit-average-densities`` to accept ``--min-data-points`` (#81)

Version 0.2.5
-------------

* Fix nans in compute_region_volumes (#80)

Version 0.2.4
-------------

* Add r_square values to the results of the fitting. (#68)
* Add check to the region filter in case the input dataset is all equal to 0 for the region of interest. (#74)
* Fix fitting and optimization steps (#75)
* Faster _compute_region_cell_counts (#72)
* Cleanup app tests (#78)
* Faster compute average intensities (#73)
* Speedup mtype density creation from probability maps using voxcell's ValueToIndexVoxels (#79)

Version 0.2.3
-------------

* remove pkg_resources, use importlib (#62)
* Drop literature values from regions that are not in hierarchy or not in the annotation volume (#61)

Version 0.2.2
-------------

* add ``atlas-densities combination manipulate``

Version 0.2.1
-------------

* The following *cell-density* sub-commands can now optionally take a ``--group-ids-config-path``:
*cell-density*, *glia-cell-densities*, *inhibitory-and-excitatory-neuron-densities*, *fit-average-densities*

Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ respective standard deviation and `coefficient of determination`_ (`r_square`).
--fitted-densities-output-path=data/ccfv2/first_estimates/first_estimates.csv \
--fitting-maps-output-path=data/ccfv2/first_estimates/fitting.json

Note: One can use the ``--min-data-points`` option to require a minimum number of data points for the linear regression; the default is 1.

Compute inhibitory/excitatory neuron densities
----------------------------------------------
Expand Down
9 changes: 9 additions & 0 deletions atlas_densities/app/cell_densities.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,13 @@ def measurements_to_average_densities(
help="Path to density groups ids config",
show_default=True,
)
@click.option(
"--min-data-points",
type=int,
default=1,
help="minimum number of datapoints required for running the linear regression.",
show_default=True,
)
@log_args(L)
def fit_average_densities(
hierarchy_path,
Expand All @@ -820,6 +827,7 @@ def fit_average_densities(
fitted_densities_output_path,
fitting_maps_output_path,
group_ids_config_path,
min_data_points,
): # pylint: disable=too-many-arguments, too-many-locals
"""
Estimate average cell densities of brain regions in `hierarchy_path` for the cell types
Expand Down Expand Up @@ -935,6 +943,7 @@ def fit_average_densities(
cell_density_stddev,
region_name=region_name,
group_ids_config=group_ids_config,
min_data_points=min_data_points,
)

# Turn index into column to ease off the save and load operations on csv files
Expand Down
41 changes: 33 additions & 8 deletions atlas_densities/densities/fitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,10 @@ def compute_average_intensities(


def linear_fitting_xy(
xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray]
xdata: list[float],
ydata: list[float],
sigma: Union[list[float], FloatArray],
min_data_points: int,
) -> dict[str, float]:
"""
Compute the coefficient of the linear least-squares regression of the point cloud
Expand All @@ -380,6 +383,10 @@ def linear_fitting_xy(
These are the standard deviations of the values in `ydata`.
Zero values are replaced by the least positive value if it exists,
otherwise by 1.0.
min_data_points: minimum number of datapoints required for running
the linear regression. If the number of datapoints is less than
min_data_points then no fitting is done, and np.nan values are
returned.

Returns:
a dict of the form {"coefficient": <float>,
Expand All @@ -389,9 +396,8 @@ def linear_fitting_xy(
Raises:
AtlasDensitiesError if some of the `sigma`values are negative.
"""

if len(xdata) == 0:
return {"coefficient": np.nan, "standard_deviation": np.nan}
if len(xdata) < min_data_points:
return {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan}

sigma = np.array(sigma)
if np.any(sigma < 0.0):
Expand Down Expand Up @@ -422,6 +428,7 @@ def _optimize_func(x, coefficient):
)
ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2)
ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg
L.debug("Length of xdata = %s. The ss_reg is %s and ss_tot is %s", xdata, ss_reg, ss_tot)
# if total sum of square is null, no variance can be explained by the fitting
return {
"coefficient": parameters[0][0],
Expand All @@ -434,7 +441,10 @@ def _optimize_func(x, coefficient):


def compute_fitting_coefficients(
groups: dict[str, set[str]], average_intensities: pd.DataFrame, densities: pd.DataFrame
groups: dict[str, set[str]],
average_intensities: pd.DataFrame,
densities: pd.DataFrame,
min_data_points: int,
) -> FittingData:
"""
Compute the linear fitting coefficient of the cloud of 2D points (average marker intensity,
Expand Down Expand Up @@ -477,7 +487,7 @@ def compute_fitting_coefficients(
The "standard_deviation" value is the standard deviation of the coefficient value.
The "r_square" value is the coefficient of determination of the coefficient value.
"""

# pylint: disable=too-many-locals
if len(densities.index) != len(average_intensities.index) or np.any(
densities.index != average_intensities.index
):
Expand Down Expand Up @@ -545,8 +555,14 @@ def compute_fitting_coefficients(
L.info("Computing regression coefficients for %d cell types ...", len(cell_types))
for cell_type in tqdm(cell_types):
cloud = clouds[group_name][cell_type]
L.debug(
"The length of training data for %s and %s is %s",
group_name,
cell_type,
cloud["xdata"],
)
result[group_name][cell_type] = linear_fitting_xy(
cloud["xdata"], cloud["ydata"], cloud["sigma"]
cloud["xdata"], cloud["ydata"], cloud["sigma"], min_data_points
)
if np.isnan(result[group_name][cell_type]["coefficient"]):
warnings.warn(
Expand Down Expand Up @@ -730,6 +746,7 @@ def linear_fitting( # pylint: disable=too-many-arguments
cell_density_stddevs: Optional[dict[str, float]] = None,
region_name: str = "root",
group_ids_config: dict | None = None,
min_data_points: int = 5,
) -> pd.DataFrame:
"""
Estimate the average densities of every region in `region_map` using a linear fitting
Expand Down Expand Up @@ -772,6 +789,10 @@ def linear_fitting( # pylint: disable=too-many-arguments
standard deviations of average cell densities of the corresponding regions.
region_name: (str) name of the root region of interest
group_ids_config: mapping of regions to their constituent ids
min_data_points: minimum number of datapoints required for running
the linear regression. If the number of datapoints is less than
min_data_points then no fitting is done, and np.nan values are
returned.

Returns:
tuple (densities, fitting_coefficients)
Expand All @@ -782,6 +803,7 @@ def linear_fitting( # pylint: disable=too-many-arguments
fitting_coefficients: dict returned by
:fun:`atlas_densities.densities.fitting.compute_fitting_coefficients`.
"""
# pylint: disable=too-many-locals
assert group_ids_config is not None
L.info("Checking input data frames sanity ...")
_check_average_densities_sanity(average_densities)
Expand Down Expand Up @@ -833,7 +855,10 @@ def linear_fitting( # pylint: disable=too-many-arguments

L.info("Computing fitting coefficients ...")
fitting_coefficients = compute_fitting_coefficients(
groups, average_intensities, densities.drop(densities.index[indexes])
groups,
average_intensities,
densities.drop(densities.index[indexes]),
min_data_points=min_data_points,
)
L.info("Fitting unknown average densities ...")
fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients)
Expand Down
1 change: 0 additions & 1 deletion atlas_densities/densities/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,5 +538,4 @@ def compute_region_volumes(
volume = result.loc[list(id_set), "id_volume"].sum()
volumes.append(volume)
result["volume"] = volumes

return result
40 changes: 33 additions & 7 deletions tests/densities/test_fitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,17 +273,24 @@ def test_compute_average_intensities(region_map, hierarchy_info):


def test_linear_fitting_xy():
actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0])
min_data_points = 1
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert np.allclose(actual["r_square"], 1.0)
assert not np.isinf(actual["standard_deviation"])

actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5])
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert not np.isinf(actual["standard_deviation"])
assert np.allclose(actual["r_square"], 0.89286)

actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0])
actual = tested.linear_fitting_xy(
[0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points
)
assert np.allclose(actual["coefficient"], 2.0)
assert not np.isinf(actual["standard_deviation"])
assert np.allclose(actual["r_square"], 1.0)
Expand Down Expand Up @@ -319,7 +326,10 @@ def test_compute_fitting_coefficients(hierarchy_info):
data = get_fitting_input_data_(hierarchy_info)

actual = tested.compute_fitting_coefficients(
data["groups"], data["intensities"], data["densities"]
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

for group_name in ["Whole", "Central lobule"]:
Expand All @@ -340,18 +350,33 @@ def test_compute_fitting_coefficients_exceptions(hierarchy_info):
data["densities"].drop(index=["Central lobule"], inplace=True)

with pytest.raises(AtlasDensitiesError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

data = get_fitting_input_data_(hierarchy_info)
data["densities"].drop(columns=["pv+"], inplace=True)

with pytest.raises(AtlasDensitiesError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)

data = get_fitting_input_data_(hierarchy_info)
data["densities"].at["Lobule II", "pv+_standard_deviation"] = np.nan
with pytest.raises(AssertionError):
tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"])
tested.compute_fitting_coefficients(
data["groups"],
data["intensities"],
data["densities"],
min_data_points=1,
)


@pytest.fixture
Expand Down Expand Up @@ -525,6 +550,7 @@ def test_linear_fitting(group_ids_config):
data["average_densities"],
data["homogenous_regions"],
group_ids_config=group_ids_config,
min_data_points=1,
)
warnings_ = [w for w in warnings_ if isinstance(w.message, AtlasDensitiesWarning)]
# Three warnings for recording NaN coefficients, three warnings for using them
Expand Down
Loading