From feedf3366405944b18a6c6be2b42e82d2f3c2af7 Mon Sep 17 00:00:00 2001 From: Jirawat I Date: Mon, 19 Apr 2021 12:18:14 +0700 Subject: [PATCH 1/2] Add conditional_bdm to compute BDM(x|y) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add `conditional_bdm(X,Y)` and `test_conditional_bdm` based on supplementary from [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356) - add compute_f_of_intersect used in Coarse Conditional BDM to compute f(n_xj, n_yj) - add pytest mark for slow in `tox.ini` --- pybdm/bdm.py | 115 ++++++++++++++++++++++++++++++++++++++++++++++ setup.cfg | 1 + tests/conftest.py | 6 ++- tests/test_bdm.py | 8 ++++ tox.ini | 5 ++ 5 files changed, 134 insertions(+), 1 deletion(-) diff --git a/pybdm/bdm.py b/pybdm/bdm.py index 8bc6939..4bd6023 100644 --- a/pybdm/bdm.py +++ b/pybdm/bdm.py @@ -398,6 +398,121 @@ def bdm(self, X, normalized=False, check_data=True): cmx = (cmx - min_cmx) / (max_cmx - min_cmx) return cmx + def conditional_bdm(self, X, Y, min_length=0, check_data=True): + """Approximate complexity of a Coarse Conditional BDM(x|y) + [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356) + + Parameters + ---------- + X : array_like + Dataset representation as a :py:class:`numpy.ndarray`. + Number of axes must agree with the `ndim` attribute. + Y : array_like + Dataset representation as a :py:class:`numpy.ndarray`. + Number of axes must agree with the `ndim` attribute. + min_length : int + Minimum parts' length. Non-negative. + In case of multidimensional objects it specifies minimum + length of any single dimension. + Default of 0 uses the smaller of the partition shape and `Y.shape` in each dimension. + check_data : bool + Should data format be checked. + May be disabled to gain some speed when calling multiple times. + + Returns + ------- + float + Approximate conditional algorithmic complexity K(x|y). + + Raises + ------ + TypeError + If `X` or `Y` is not an integer array and `check_data=True`. 
+ ValueError + If `X` or `Y` has more than `nsymbols` unique values + and `check_data=True`. + ValueError + If `X` or `Y` has symbols outside of the ``0`` to `nsymbols-1` range + and `check_data=True`. + ValueError + If computed BDM value is 0 and `raise_if_zero` is ``True``. + + Notes + ----- + Detailed description can be found in :doc:`theory`. + + Examples + -------- + >>> import numpy as np + >>> bdm = BDM(ndim=1, partition=PartitionCorrelated, shift=3) + >>> X = encoding.array_from_string('010101010101010101111', (21,)) + >>> Y = encoding.array_from_string('010', (3,)) + >>> bdm.conditional_bdm(X, Y) # doctest: +FLOAT_CMP + 14.071500815885443 + """ + if check_data: + self._check_data(X) + self._check_data(Y) + + # Backup previous value of shape in partition algorithm + old_shape = self.partition.shape + # Find new minimal shape + shape = list(old_shape) + for i in range(0, len(old_shape)): + shape[i] = min(min_length if min_length > 0 else old_shape[i], Y.shape[i]) + # use new shape in partition algorithm + self.partition.shape = tuple(shape) + # Find adjX and adjY + adjX = self.decompose_and_count(X) + adjY = self.decompose_and_count(Y) + # Find set difference from adjX and adjY + adjDiff = Counter() + for key, count in adjX.items(): + if key not in adjY: + adjDiff[key] = count + # Restore previous value of shape in partition algorithm + self.partition.shape = old_shape + # Calculate the BDM(x|y) + cmx = self.compute_bdm(adjDiff) + self.compute_f_of_intersect(adjX, adjY) + if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0: + raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions") + return cmx + + def compute_f_of_intersect(self, adjX, adjY): + """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y). + [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356) + + Parameters + ---------- + adjX, adjY : + Counter objects grouping object keys and occurrences. 
+ + Returns + ------- + float + f(n_xi, n_yi) + + Notes + ----- + Detailed description can be found in :doc:`theory`. + + Examples + -------- + >>> from collections import Counter + >>> bdm = BDM(ndim=1) + >>> c1 = Counter([('111111111111', 1.95207842085224e-08), ('111111111111', 1.95207842085224e-08)]) + >>> c2 = Counter([('111111111111', 1.95207842085224e-08)]) + >>> bdm.compute_f_of_intersect(c1, c2) # doctest: +FLOAT_CMP + 1.0 + """ + if not isinstance(adjX, Counter) or not isinstance(adjY, Counter): + return NotImplemented + bdm = 0 + for elem, count in adjX.items(): + if elem in adjY and adjY[elem] != count: + bdm += log2(count) + return bdm + def nbdm(self, X, **kwds): """Alias for normalized BDM diff --git a/setup.cfg b/setup.cfg index 99ea8cd..fef81e6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,3 +26,4 @@ doctest_plus = enabled filterwarnings = ignore:CTM dataset does not contain object ignore:Using or importing the ABCs + \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 4898c9b..c002d9f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,7 @@ """*PyTest* configuration and general purpose fixtures.""" import pytest from pybdm import BDM -from pybdm.partitions import PartitionRecursive +from pybdm.partitions import PartitionCorrelated, PartitionRecursive def pytest_addoption(parser): @@ -51,3 +51,7 @@ def bdm_d2(): @pytest.fixture(scope='session') def bdm_d1_b9(): return BDM(ndim=1, nsymbols=9, partition=PartitionRecursive, min_length=1) + +@pytest.fixture(scope='session') +def bdm_d1_collapse3(): + return BDM(ndim=1, partition=PartitionCorrelated, shift=3) diff --git a/tests/test_bdm.py b/tests/test_bdm.py index 681ed3b..2c5654c 100755 --- a/tests/test_bdm.py +++ b/tests/test_bdm.py @@ -162,6 +162,14 @@ def test_nent_d2(self, bdm_d2, X, expected): output = bdm_d2.nent(X) assert output == approx(expected) + @pytest.mark.parametrize('X,Y,expected', [ + (array_from_string('010101010101010101111', 
shape=(21,)), array_from_string('010', shape=(3,)), 14.071500815885443), + (array_from_string('010101010101010101111', shape=(21,)), array_from_string('000', shape=(3,)), 19.57688365108973), + ]) + def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected): + output = bdm_d1_collapse3.conditional_bdm(X, Y) + assert output == approx(expected) + @pytest.mark.slow def test_bdm_parallel(self, bdm_d2): X = np.ones((500, 500), dtype=int) diff --git a/tox.ini b/tox.ini index 302d0be..9aa7cb9 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,11 @@ [tox] envlist = py35, py36, py37, style, docs +[pytest] +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + serial + [testenv] setenv = PYTHONPATH = {toxinidir}:{toxinidir}/pybdm From 71461aa08396034e7cf81cddc11c4dcfad97296e Mon Sep 17 00:00:00 2001 From: Jirawat I Date: Mon, 19 Apr 2021 13:33:51 +0700 Subject: [PATCH 2/2] fixed lints and docs --- docs/conf.py | 1 + docs/references.bib | 11 +++++++++++ pybdm/algorithms.py | 2 +- pybdm/bdm.py | 29 ++++++++++++++--------------- pybdm/options.py | 4 ++-- pybdm/utils.py | 2 +- tests/test_bdm.py | 12 ++++++++++-- tests/test_partitions.py | 2 +- tests/test_perturbation.py | 4 ++-- 9 files changed, 43 insertions(+), 24 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 462b62a..7636cfe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -39,6 +39,7 @@ 'sphinx.ext.mathjax', 'sphinxcontrib.bibtex' ] +bibtex_bibfiles = ['references.bib'] # Napoleon settings diff --git a/docs/references.bib b/docs/references.bib index 18f87a2..7148746 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -210,3 +210,14 @@ @article{morzy_measuring_2017 file = {Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:/home/sztal/Zotero/storage/JEWZ5RI6/Morzy i in. 
- 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:application/pdf} } +@article{hernndez_ml_2021, + doi = {10.3389/frai.2020.567356}, + url = {https://doi.org/10.3389/frai.2020.567356}, + year = {2021}, + month = jan, + publisher = {Frontiers Media {SA}}, + volume = {3}, + author = {Santiago Hern{\'{a}}ndez-Orozco and Hector Zenil and J\"{u}rgen Riedel and Adam Uccello and Narsis A. Kiani and Jesper Tegn{\'{e}}r}, + title = {Algorithmic Probability-Guided Machine Learning on Non-Differentiable Spaces}, + journal = {Frontiers in Artificial Intelligence} +} diff --git a/pybdm/algorithms.py b/pybdm/algorithms.py index 694440e..86cbe61 100644 --- a/pybdm/algorithms.py +++ b/pybdm/algorithms.py @@ -258,7 +258,7 @@ def run(self, idx=None, values=None, keep_changes=False): """ if idx is None: indexes = [ range(k) for k in self.X.shape ] - idx = np.array([ x for x in product(*indexes) ], dtype=int) + idx = np.array(list(product(*indexes)), dtype=int) if values is None: values = np.full((idx.shape[0], ), -1, dtype=int) return np.apply_along_axis( diff --git a/pybdm/bdm.py b/pybdm/bdm.py index 4bd6023..67af1d0 100644 --- a/pybdm/bdm.py +++ b/pybdm/bdm.py @@ -129,21 +129,21 @@ def __init__(self, ndim, nsymbols=2, shape=None, partition=PartitionIgnore, self.ndim = ndim try: self.ctmname = ctmname if ctmname else self._ndim_to_ctm[(ndim, nsymbols)] - except KeyError: + except KeyError as key_error: msg = "no CTM dataset for 'ndim={}' and 'nsymbols={}'".format( ndim, nsymbols ) - raise CTMDatasetNotFoundError(msg) + raise CTMDatasetNotFoundError(msg) from key_error try: nsymbols, _shape = self.ctmname.split('-')[-2:] - except ValueError: + except ValueError as value_error: msg = "incorrect 'ctmname'; it should be in format " + \ "'name-b-d'" - raise BDMConfigurationError(msg) + raise BDMConfigurationError(msg) from value_error self.nsymbols = int(nsymbols[1:]) if shape is None: shape = tuple(int(x) for x in _shape[1:].split('x')) - if any([ x != shape[0] for x in 
shape ]): + if any(x != shape[0] for x in shape): raise BDMConfigurationError("'shape' has to be equal in each dimension") ctm, ctm_missing = get_ctm_dataset(self.ctmname) self._ctm = ctm @@ -399,8 +399,7 @@ def bdm(self, X, normalized=False, check_data=True): return cmx def conditional_bdm(self, X, Y, min_length=0, check_data=True): - """Approximate complexity of a Coarse Conditional BDM(x|y) - [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356) + """Approximate complexity of a Coarse Conditional BDM(x|y) :cite:`hernndez_ml_2021` Parameters ---------- @@ -453,12 +452,12 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True): if check_data: self._check_data(X) self._check_data(Y) - - # Backup previous value of shape in partition algorithm + + # Backup previous value of shape in partition algorithm old_shape = self.partition.shape # Find new minimal shape shape = list(old_shape) - for i in range(0, len(old_shape)): + for i, _ in enumerate(old_shape): shape[i] = min(min_length if min_length > 0 else old_shape[i], Y.shape[i]) # use new shape in partition algorithm self.partition.shape = tuple(shape) @@ -469,7 +468,7 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True): adjDiff = Counter() for key, count in adjX.items(): if key not in adjY: - adjDiff[key] = count + adjDiff[key] = count # Restore previous value of shape in partition algorithm self.partition.shape = old_shape # Calculate the BDM(x|y) @@ -477,10 +476,10 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True): if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0: raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions") return cmx - + def compute_f_of_intersect(self, adjX, adjY): """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y). 
- [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356) + :cite:`hernndez_ml_2021` Parameters ---------- @@ -512,7 +511,7 @@ def compute_f_of_intersect(self, adjX, adjY): if elem in adjY and adjY[elem] != count: bdm += log2(count) return bdm - + def nbdm(self, X, **kwds): """Alias for normalized BDM @@ -629,7 +628,7 @@ def _check_data(self, X): raise ValueError("'X' has more than {} unique symbols".format( self.nsymbols )) - valid_symbols = np.array([ _ for _ in range(self.nsymbols) ]) + valid_symbols = np.arange(self.nsymbols) bad_symbols = symbols[~np.isin(symbols, valid_symbols)] if bad_symbols.size > 0: raise ValueError("'X' contains symbols outside of [0, {}]: {}".format( diff --git a/pybdm/options.py b/pybdm/options.py index 73c0ea8..f953091 100644 --- a/pybdm/options.py +++ b/pybdm/options.py @@ -59,5 +59,5 @@ def get(name=None): return _options.copy() try: return _options[name] - except KeyError: - raise KeyError("there is no '{}' option".format(name)) + except KeyError as key_error: + raise KeyError("there is no '{}' option".format(name)) from key_error diff --git a/pybdm/utils.py b/pybdm/utils.py index 922ecd6..cce3fe8 100644 --- a/pybdm/utils.py +++ b/pybdm/utils.py @@ -150,7 +150,7 @@ def list_ctm_datasets(): >>> list_ctm_datasets() ['CTM-B2-D12', 'CTM-B2-D4x4', 'CTM-B4-D12', 'CTM-B5-D12', 'CTM-B6-D12', 'CTM-B9-D12'] """ - return [ x for x in sorted(_ctm_datasets.keys()) ] + return list(sorted(_ctm_datasets.keys())) @lru_cache(maxsize=2**int(np.ceil(np.log2(len(_ctm_datasets))))) def get_ctm_dataset(name): diff --git a/tests/test_bdm.py b/tests/test_bdm.py index 2c5654c..ffc1ed7 100755 --- a/tests/test_bdm.py +++ b/tests/test_bdm.py @@ -163,8 +163,16 @@ def test_nent_d2(self, bdm_d2, X, expected): assert output == approx(expected) @pytest.mark.parametrize('X,Y,expected', [ - (array_from_string('010101010101010101111', shape=(21,)), array_from_string('010', shape=(3,)), 14.071500815885443), - (array_from_string('010101010101010101111', 
shape=(21,)), array_from_string('000', shape=(3,)), 19.57688365108973), + ( + array_from_string('010101010101010101111', shape=(21,)), + array_from_string('010', shape=(3,)), + 14.071500815885443 + ), + ( + array_from_string('010101010101010101111', shape=(21,)), + array_from_string('000', shape=(3,)), + 19.57688365108973 + ), ]) def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected): output = bdm_d1_collapse3.conditional_bdm(X, Y) diff --git a/tests/test_partitions.py b/tests/test_partitions.py index dfe5044..faa5c8a 100644 --- a/tests/test_partitions.py +++ b/tests/test_partitions.py @@ -5,7 +5,7 @@ def _test_decompose(partition, X, expected): - output = [ p for p in partition.decompose(X) ] + output = list(partition.decompose(X)) assert len(output) == len(expected) assert all(np.array_equal(o, e) for o, e in zip(output, expected)) diff --git a/tests/test_perturbation.py b/tests/test_perturbation.py index b625d8f..72d0d64 100644 --- a/tests/test_perturbation.py +++ b/tests/test_perturbation.py @@ -82,7 +82,7 @@ class TestPerturbationExperiment: ]) def test_idx_to_parts(self, perturbation, idx, expected): expected = [ perturbation.X[s] for s in expected ] - output = [ x for x in perturbation._idx_to_parts(idx) ] + output = list(perturbation._idx_to_parts(idx)) assert len(output) == len(expected) for o, e in zip(output, expected): assert np.array_equal(o, e) @@ -104,7 +104,7 @@ def test_idx_to_parts(self, perturbation, idx, expected): def test_idx_to_parts_overlap(self, perturbation_overlap, idx, expected): perturbation = perturbation_overlap expected = [ perturbation.X[s] for s in expected ] - output = [ x for x in perturbation._idx_to_parts(idx) ] + output = list(perturbation._idx_to_parts(idx)) assert len(output) == len(expected) for o, e in zip(output, expected): assert np.array_equal(o, e)