diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5b0ff09..29bc6a7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,11 +1,41 @@ Changelog ========= +Version 0.2.6 +------------- +* Updating regression requirements, teach + `atlas-densities cell-densities fit-average-densities` to have `--min-data-points` (#81) + +Version 0.2.5 +------------- + +* Fix nans in compute_region_volumes (#80) + +Version 0.2.4 +------------- + +* Add r_square values to the results of the fitting. (#68) +* Add check to the region filter in case the input dataset is all equal to 0 for the region of interest. (#74) +* Fix fitting and optimization steps (#75) +* Faster _compute_region_cell_counts (#72) +* Cleanup app tests (#78) +* Faster compute average intensities (#73) +* Speedup mtype density creation from probability maps using voxcell's ValueToIndexVoxels (#79) + +Version 0.2.3 +------------- + +* remove pkg_resources, use importlib (#62) +* Drop literature values from regions that are not in hierarchy or not in the annotation volume (#61) + Version 0.2.2 +------------- + * add ``atlas-densities combination manipulate`` Version 0.2.1 ------------- + * The following *cell-density* sub-commands can now optionally take a ``--group-ids-config-path``: *cell-density*, *glia-cell-densities*, *inhibitory-and-excitatory-neuron-densities*, *fit-average-densities* diff --git a/README.rst b/README.rst index 7b3d708..0224a25 100644 --- a/README.rst +++ b/README.rst @@ -286,6 +286,7 @@ respective standard deviation and `coefficient of determination`_ (`r_square`). --fitted-densities-output-path=data/ccfv2/first_estimates/first_estimates.csv \ --fitting-maps-output-path=data/ccfv2/first_estimates/fitting.json +Note: One can use the ``--min-data-points`` to require a minimum of points for the linear regression; the default is 1. Compute inhibitory/excitatory neuron densities ---------------------------------------------- diff --git a/atlas_densities/app/cell_densities.py b/atlas_densities/app/cell_densities.py index 961b327..e168de2 100644 --- a/atlas_densities/app/cell_densities.py +++ b/atlas_densities/app/cell_densities.py @@ -808,6 +808,13 @@ def measurements_to_average_densities( help="Path to density groups ids config", show_default=True, ) +@click.option( + "--min-data-points", + type=int, + default=1, + help="minimum number of datapoints required for running the linear regression.", + show_default=True, +) @log_args(L) def fit_average_densities( hierarchy_path, @@ -820,6 +827,7 @@ def fit_average_densities( fitted_densities_output_path, fitting_maps_output_path, group_ids_config_path, + min_data_points, ): # pylint: disable=too-many-arguments, too-many-locals """ Estimate average cell densities of brain regions in `hierarchy_path` for the cell types @@ -935,6 +943,7 @@ def fit_average_densities( cell_density_stddev, region_name=region_name, group_ids_config=group_ids_config, + min_data_points=min_data_points, ) # Turn index into column to ease off the save and load operations on csv files diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6f9d558..4f6fe0b 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -367,7 +367,10 @@ def compute_average_intensities( def linear_fitting_xy( - xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray] + xdata: list[float], + ydata: list[float], + sigma: Union[list[float], FloatArray], + min_data_points: int, ) -> dict[str, float]: """ Compute the coefficient of the linear least-squares regression of the point cloud @@ -380,6 +383,10 @@ def linear_fitting_xy( These are the standard deviations of the values in `ydata`. Zero values are replaced by the least positive value if it exists, otherwise by 1.0. + min_data_points: minimum number of datapoints required for running + the linear regression. If the number of datapoints is less than + min_data_points then no fitting is done, and np.nan values are + returned. Returns: a dict of the form {"coefficient": , @@ -389,9 +396,8 @@ def linear_fitting_xy( Raises: AtlasDensitiesError if some of the `sigma`values are negative. """ - - if len(xdata) == 0: - return {"coefficient": np.nan, "standard_deviation": np.nan} + if len(xdata) < min_data_points: + return {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan} sigma = np.array(sigma) if np.any(sigma < 0.0): @@ -422,6 +428,7 @@ def _optimize_func(x, coefficient): ) ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2) ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg + L.debug("Length of xdata = %s. The ss_reg is %s and ss_tot is %s", xdata, ss_reg, ss_tot) # if total sum of square is null, no variance can be explained by the fitting return { "coefficient": parameters[0][0], @@ -434,7 +441,10 @@ def _optimize_func(x, coefficient): def compute_fitting_coefficients( - groups: dict[str, set[str]], average_intensities: pd.DataFrame, densities: pd.DataFrame + groups: dict[str, set[str]], + average_intensities: pd.DataFrame, + densities: pd.DataFrame, + min_data_points: int, ) -> FittingData: """ Compute the linear fitting coefficient of the cloud of 2D points (average marker intensity, @@ -477,7 +487,7 @@ def compute_fitting_coefficients( The "standard_deviation" value is the standard deviation of the coefficient value. The "r_square" value is the coefficient of determination of the coefficient value. """ - + # pylint: disable=too-many-locals if len(densities.index) != len(average_intensities.index) or np.any( densities.index != average_intensities.index ): @@ -545,8 +555,14 @@ def compute_fitting_coefficients( L.info("Computing regression coefficients for %d cell types ...", len(cell_types)) for cell_type in tqdm(cell_types): cloud = clouds[group_name][cell_type] + L.debug( + "The length of training data for %s and %s is %s", + group_name, + cell_type, + cloud["xdata"], + ) result[group_name][cell_type] = linear_fitting_xy( - cloud["xdata"], cloud["ydata"], cloud["sigma"] + cloud["xdata"], cloud["ydata"], cloud["sigma"], min_data_points ) if np.isnan(result[group_name][cell_type]["coefficient"]): warnings.warn( @@ -730,6 +746,7 @@ def linear_fitting( # pylint: disable=too-many-arguments cell_density_stddevs: Optional[dict[str, float]] = None, region_name: str = "root", group_ids_config: dict | None = None, + min_data_points: int = 5, ) -> pd.DataFrame: """ Estimate the average densities of every region in `region_map` using a linear fitting @@ -772,6 +789,10 @@ def linear_fitting( # pylint: disable=too-many-arguments standard deviations of average cell densities of the corresponding regions. region_name: (str) name of the root region of interest group_ids_config: mapping of regions to their constituent ids + min_data_points: minimum number of datapoints required for running + the linear regression. If the number of datapoints is less than + min_data_points then no fitting is done, and np.nan values are + returned. Returns: tuple (densities, fitting_coefficients) @@ -782,6 +803,7 @@ def linear_fitting( # pylint: disable=too-many-arguments fitting_coefficients: dict returned by :fun:`atlas_densities.densities.fitting.compute_fitting_coefficients`. """ + # pylint: disable=too-many-locals assert group_ids_config is not None L.info("Checking input data frames sanity ...") _check_average_densities_sanity(average_densities) @@ -833,7 +855,10 @@ def linear_fitting( # pylint: disable=too-many-arguments L.info("Computing fitting coefficients ...") fitting_coefficients = compute_fitting_coefficients( - groups, average_intensities, densities.drop(densities.index[indexes]) + groups, + average_intensities, + densities.drop(densities.index[indexes]), + min_data_points=min_data_points, ) L.info("Fitting unknown average densities ...") fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients) diff --git a/atlas_densities/densities/utils.py b/atlas_densities/densities/utils.py index 2e76655..c26b002 100644 --- a/atlas_densities/densities/utils.py +++ b/atlas_densities/densities/utils.py @@ -538,5 +538,4 @@ def compute_region_volumes( volume = result.loc[list(id_set), "id_volume"].sum() volumes.append(volume) result["volume"] = volumes - return result diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py index b696b6f..935c016 100644 --- a/tests/densities/test_fitting.py +++ b/tests/densities/test_fitting.py @@ -273,17 +273,24 @@ def test_compute_average_intensities(region_map, hierarchy_info): def test_linear_fitting_xy(): - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0]) + min_data_points = 1 + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert np.allclose(actual["r_square"], 1.0) assert not np.isinf(actual["standard_deviation"]) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5]) + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 0.89286) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0]) + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 1.0) @@ -319,7 +326,10 @@ def test_compute_fitting_coefficients(hierarchy_info): data = get_fitting_input_data_(hierarchy_info) actual = tested.compute_fitting_coefficients( - data["groups"], data["intensities"], data["densities"] + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, ) for group_name in ["Whole", "Central lobule"]: @@ -340,18 +350,33 @@ def test_compute_fitting_coefficients_exceptions(hierarchy_info): data["densities"].drop(index=["Central lobule"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].drop(columns=["pv+"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].at["Lobule II", "pv+_standard_deviation"] = np.nan with pytest.raises(AssertionError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) @pytest.fixture @@ -525,6 +550,7 @@ def test_linear_fitting(group_ids_config): data["average_densities"], data["homogenous_regions"], group_ids_config=group_ids_config, + min_data_points=1, ) warnings_ = [w for w in warnings_ if isinstance(w.message, AtlasDensitiesWarning)] # Three warnings for recording NaN coefficients, three warnings for using them