Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conditional_bdm to compute BDM(x|y) #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
'sphinx.ext.mathjax',
'sphinxcontrib.bibtex'
]
bibtex_bibfiles = ['references.bib']


# Napoleon settings
Expand Down
11 changes: 11 additions & 0 deletions docs/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,14 @@ @article{morzy_measuring_2017
file = {Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:/home/sztal/Zotero/storage/JEWZ5RI6/Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:application/pdf}
}

@article{hernndez_ml_2021,
doi = {10.3389/frai.2020.567356},
url = {https://doi.org/10.3389/frai.2020.567356},
year = {2021},
month = jan,
publisher = {Frontiers Media {SA}},
volume = {3},
author = {Santiago Hern{\'{a}}ndez-Orozco and Hector Zenil and J\"{u}rgen Riedel and Adam Uccello and Narsis A. Kiani and Jesper Tegn{\'{e}}r},
title = {Algorithmic Probability-Guided Machine Learning on Non-Differentiable Spaces},
journal = {Frontiers in Artificial Intelligence}
}
2 changes: 1 addition & 1 deletion pybdm/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def run(self, idx=None, values=None, keep_changes=False):
"""
if idx is None:
indexes = [ range(k) for k in self.X.shape ]
idx = np.array([ x for x in product(*indexes) ], dtype=int)
idx = np.array(list(product(*indexes)), dtype=int)
if values is None:
values = np.full((idx.shape[0], ), -1, dtype=int)
return np.apply_along_axis(
Expand Down
126 changes: 120 additions & 6 deletions pybdm/bdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,21 +129,21 @@ def __init__(self, ndim, nsymbols=2, shape=None, partition=PartitionIgnore,
self.ndim = ndim
try:
self.ctmname = ctmname if ctmname else self._ndim_to_ctm[(ndim, nsymbols)]
except KeyError:
except KeyError as key_error:
msg = "no CTM dataset for 'ndim={}' and 'nsymbols={}'".format(
ndim, nsymbols
)
raise CTMDatasetNotFoundError(msg)
raise CTMDatasetNotFoundError(msg) from key_error
try:
nsymbols, _shape = self.ctmname.split('-')[-2:]
except ValueError:
except ValueError as value_error:
msg = "incorrect 'ctmname'; it should be in format " + \
"'name-b<nsymbols>-d<shape>'"
raise BDMConfigurationError(msg)
raise BDMConfigurationError(msg) from value_error
self.nsymbols = int(nsymbols[1:])
if shape is None:
shape = tuple(int(x) for x in _shape[1:].split('x'))
if any([ x != shape[0] for x in shape ]):
if any(x != shape[0] for x in shape):
raise BDMConfigurationError("'shape' has to be equal in each dimension")
ctm, ctm_missing = get_ctm_dataset(self.ctmname)
self._ctm = ctm
Expand Down Expand Up @@ -398,6 +398,120 @@ def bdm(self, X, normalized=False, check_data=True):
cmx = (cmx - min_cmx) / (max_cmx - min_cmx)
return cmx

def conditional_bdm(self, X, Y, min_length=0, check_data=True):
    """Approximate complexity of a Coarse Conditional BDM(x|y) :cite:`hernndez_ml_2021`

    Parameters
    ----------
    X : array_like
        Dataset representation as a :py:class:`numpy.ndarray`.
        Number of axes must agree with the `ndim` attribute.
    Y : array_like
        Dataset representation as a :py:class:`numpy.ndarray`.
        Number of axes must agree with the `ndim` attribute.
    min_length : int
        Minimum parts' length. Non-negative.
        In case of multidimensional objects it specifies minimum
        length of any single dimension.
        Default of 0 uses, per dimension, the minimum of the
        partition shape and ``Y.shape``.
    check_data : bool
        Should data format be checked.
        May be disabled to gain some speed when calling multiple times.

    Returns
    -------
    float
        Approximate conditional algorithmic complexity K(x|y).

    Raises
    ------
    TypeError
        If `X` or `Y` is not an integer array and `check_data=True`.
    ValueError
        If `X` or `Y` has more than `nsymbols` unique values
        and `check_data=True`.
    ValueError
        If `X` or `Y` has symbols outside of the ``0`` to `nsymbols-1` range
        and `check_data=True`.
    ValueError
        If computed BDM value is 0 and `raise_if_zero` is ``True``.

    Notes
    -----
    Detailed description can be found in :doc:`theory`.

    Examples
    --------
    >>> import numpy as np
    >>> bdm = BDM(ndim=1, partition=PartitionCorrelated, shift=3)
    >>> X = encoding.array_from_string('010101010101010101111', (21,))
    >>> Y = encoding.array_from_string('010', (3,))
    >>> bdm.conditional_bdm(X, Y) # doctest: +FLOAT_CMP
    14.071500815885443
    """
    if check_data:
        self._check_data(X)
        self._check_data(Y)

    # Backup previous value of shape in the partition algorithm;
    # it is temporarily overridden below and must always be restored.
    old_shape = self.partition.shape
    # Find new minimal shape: per dimension, the smaller of the
    # (possibly overridden) part length and the extent of Y.
    new_shape = tuple(
        min(min_length if min_length > 0 else dim, Y.shape[i])
        for i, dim in enumerate(old_shape)
    )
    self.partition.shape = new_shape
    try:
        # Decompose both objects into part counters under the new shape
        adjX = self.decompose_and_count(X)
        adjY = self.decompose_and_count(Y)
    finally:
        # Restore the partition shape even if decomposition fails,
        # so the BDM object is not left in a corrupted state.
        self.partition.shape = old_shape
    # Parts of X that never occur in Y contribute their full CTM cost
    adjDiff = Counter({key: count for key, count in adjX.items()
                       if key not in adjY})
    # BDM(x|y) = BDM over the set difference + f(n_xi, n_yi) correction
    cmx = self.compute_bdm(adjDiff) + self.compute_f_of_intersect(adjX, adjY)
    if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0:
        raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions")
    return cmx

def compute_f_of_intersect(self, adjX, adjY):
    """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y).
    :cite:`hernndez_ml_2021`

    Parameters
    ----------
    adjX, adjY : collections.Counter
        Counter objects grouping object keys and occurrences.

    Returns
    -------
    float
        f(n_xi, n_yi), the correction term summed over parts shared
        by both counters whose multiplicities differ.

    Notes
    -----
    Detailed description can be found in :doc:`theory`.

    Examples
    --------
    >>> from collections import Counter
    >>> bdm = BDM(ndim=1)
    >>> c1 = Counter([('111111111111', 1.95207842085224e-08), ('111111111111', 1.95207842085224e-08)])
    >>> c2 = Counter([('111111111111', 1.95207842085224e-08)])
    >>> bdm.compute_f_of_intersect(c1, c2) # doctest: +FLOAT_CMP
    1.0
    """
    # Guard: both arguments must be Counters; mirror the binary-operator
    # convention by signalling unsupported operands with NotImplemented.
    if not (isinstance(adjX, Counter) and isinstance(adjY, Counter)):
        return NotImplemented
    # Sum log2 of multiplicities over keys present in both counters
    # with differing counts; empty sum yields 0 as before.
    return sum(
        log2(n_xi)
        for key, n_xi in adjX.items()
        if key in adjY and adjY[key] != n_xi
    )

def nbdm(self, X, **kwds):
"""Alias for normalized BDM

Expand Down Expand Up @@ -514,7 +628,7 @@ def _check_data(self, X):
raise ValueError("'X' has more than {} unique symbols".format(
self.nsymbols
))
valid_symbols = np.array([ _ for _ in range(self.nsymbols) ])
valid_symbols = np.arange(self.nsymbols)
bad_symbols = symbols[~np.isin(symbols, valid_symbols)]
if bad_symbols.size > 0:
raise ValueError("'X' contains symbols outside of [0, {}]: {}".format(
Expand Down
4 changes: 2 additions & 2 deletions pybdm/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ def get(name=None):
return _options.copy()
try:
return _options[name]
except KeyError:
raise KeyError("there is no '{}' option".format(name))
except KeyError as key_error:
raise KeyError("there is no '{}' option".format(name)) from key_error
2 changes: 1 addition & 1 deletion pybdm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def list_ctm_datasets():
>>> list_ctm_datasets()
['CTM-B2-D12', 'CTM-B2-D4x4', 'CTM-B4-D12', 'CTM-B5-D12', 'CTM-B6-D12', 'CTM-B9-D12']
"""
return [ x for x in sorted(_ctm_datasets.keys()) ]
return list(sorted(_ctm_datasets.keys()))

@lru_cache(maxsize=2**int(np.ceil(np.log2(len(_ctm_datasets)))))
def get_ctm_dataset(name):
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ doctest_plus = enabled
filterwarnings =
ignore:CTM dataset does not contain object
ignore:Using or importing the ABCs

6 changes: 5 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""*PyTest* configuration and general purpose fixtures."""
import pytest
from pybdm import BDM
from pybdm.partitions import PartitionRecursive
from pybdm.partitions import PartitionCorrelated, PartitionRecursive


def pytest_addoption(parser):
Expand Down Expand Up @@ -51,3 +51,7 @@ def bdm_d2():
@pytest.fixture(scope='session')
def bdm_d1_b9():
return BDM(ndim=1, nsymbols=9, partition=PartitionRecursive, min_length=1)

@pytest.fixture(scope='session')
def bdm_d1_collapse3():
    # Session-scoped 1D binary BDM using the correlated (sliding-window)
    # partition with a shift of 3; used by the conditional BDM tests.
    return BDM(ndim=1, partition=PartitionCorrelated, shift=3)
16 changes: 16 additions & 0 deletions tests/test_bdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,22 @@ def test_nent_d2(self, bdm_d2, X, expected):
output = bdm_d2.nent(X)
assert output == approx(expected)

@pytest.mark.parametrize('X,Y,expected', [
    # Y ('010') occurs among X's parts, so the intersection correction applies
    (
        array_from_string('010101010101010101111', shape=(21,)),
        array_from_string('010', shape=(3,)),
        14.071500815885443
    ),
    # Y ('000') shares no parts with X, so BDM(x|y) is larger
    (
        array_from_string('010101010101010101111', shape=(21,)),
        array_from_string('000', shape=(3,)),
        19.57688365108973
    ),
])
def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected):
    # Expected values computed with PartitionCorrelated, shift=3
    # (see the bdm_d1_collapse3 fixture) — TODO confirm against the paper.
    output = bdm_d1_collapse3.conditional_bdm(X, Y)
    assert output == approx(expected)

@pytest.mark.slow
def test_bdm_parallel(self, bdm_d2):
X = np.ones((500, 500), dtype=int)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_partitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


def _test_decompose(partition, X, expected):
output = [ p for p in partition.decompose(X) ]
output = list(partition.decompose(X))
assert len(output) == len(expected)
assert all(np.array_equal(o, e) for o, e in zip(output, expected))

Expand Down
4 changes: 2 additions & 2 deletions tests/test_perturbation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class TestPerturbationExperiment:
])
def test_idx_to_parts(self, perturbation, idx, expected):
expected = [ perturbation.X[s] for s in expected ]
output = [ x for x in perturbation._idx_to_parts(idx) ]
output = list(perturbation._idx_to_parts(idx))
assert len(output) == len(expected)
for o, e in zip(output, expected):
assert np.array_equal(o, e)
Expand All @@ -104,7 +104,7 @@ def test_idx_to_parts(self, perturbation, idx, expected):
def test_idx_to_parts_overlap(self, perturbation_overlap, idx, expected):
perturbation = perturbation_overlap
expected = [ perturbation.X[s] for s in expected ]
output = [ x for x in perturbation._idx_to_parts(idx) ]
output = list(perturbation._idx_to_parts(idx))
assert len(output) == len(expected)
for o, e in zip(output, expected):
assert np.array_equal(o, e)
Expand Down
5 changes: 5 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
[tox]
envlist = py35, py36, py37, style, docs

[pytest]
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
serial

[testenv]
setenv =
PYTHONPATH = {toxinidir}:{toxinidir}/pybdm
Expand Down