From feedf3366405944b18a6c6be2b42e82d2f3c2af7 Mon Sep 17 00:00:00 2001
From: Jirawat I <nodtem66@gmail.com>
Date: Mon, 19 Apr 2021 12:18:14 +0700
Subject: [PATCH 1/2] Add conditional_bdm to compute BDM(x|y)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- add `conditional_bdm(X, Y)` and `test_conditional_bdm`, based on the
  supplementary material of [Hernández-Orozco S, Zenil H, et al. (2021)](doi:10.3389/frai.2020.567356)
- add `compute_f_of_intersect`, used by Coarse Conditional BDM
  to compute f(n_xj, n_yj)
- add the `bdm_d1_collapse3` fixture in `tests/conftest.py`
- register the `slow` and `serial` pytest markers in `tox.ini`
---
 pybdm/bdm.py      | 115 ++++++++++++++++++++++++++++++++++++++++++++++
 setup.cfg         |   1 +
 tests/conftest.py |   6 ++-
 tests/test_bdm.py |   8 ++++
 tox.ini           |   5 ++
 5 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/pybdm/bdm.py b/pybdm/bdm.py
index 8bc6939..4bd6023 100644
--- a/pybdm/bdm.py
+++ b/pybdm/bdm.py
@@ -398,6 +398,121 @@ def bdm(self, X, normalized=False, check_data=True):
             cmx = (cmx - min_cmx) / (max_cmx - min_cmx)
         return cmx
 
+    def conditional_bdm(self, X, Y, min_length=0, check_data=True):
+        """Approximate complexity of a Coarse Conditional BDM(x|y)
+        [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356)
+
+        Parameters
+        ----------
+        X : array_like
+            Dataset representation as a :py:class:`numpy.ndarray`.
+            Number of axes must agree with the `ndim` attribute.
+        Y : array_like
+            Dataset representation as a :py:class:`numpy.ndarray`.
+            Number of axes must agree with the `ndim` attribute.
+        min_length : int
+            Minimum parts' length. Non-negative.
+            In case of multidimensional objects it specifies minimum
+            length of any single dimension.
+            The default of 0 uses min(partition shape, Y.shape) along each axis.
+        check_data : bool
+            Should data format be checked.
+            May be disabled to gain some speed when calling multiple times.
+
+        Returns
+        -------
+        float
+            Approximate conditional algorithmic complexity K(x|y).
+
+        Raises
+        ------
+        TypeError
+            If `X` or `Y` is not an integer array and `check_data=True`.
+        ValueError
+            If `X` or `Y` has more than `nsymbols` unique values
+            and `check_data=True`.
+        ValueError
+            If `X` or `Y` has symbols outside of the ``0`` to `nsymbols-1` range
+            and `check_data=True`.
+        ValueError
+            If computed BDM value is 0 and `raise_if_zero` is ``True``.
+
+        Notes
+        -----
+        Detailed description can be found in :doc:`theory`.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> bdm = BDM(ndim=1, partition=PartitionCorrelated, shift=3)
+        >>> X = encoding.array_from_string('010101010101010101111', (21,))
+        >>> Y = encoding.array_from_string('010', (3,))
+        >>> bdm.conditional_bdm(X, Y) # doctest: +FLOAT_CMP
+        14.071500815885443
+        """
+        if check_data:
+            self._check_data(X)
+            self._check_data(Y)
+        
+        # Backup previous value of shape in partition algorithm 
+        old_shape = self.partition.shape
+        # Find new minimal shape
+        shape = list(old_shape)
+        for i in range(0, len(old_shape)):
+            shape[i] = min(min_length if min_length > 0 else old_shape[i], Y.shape[i])
+        # use new shape in partition algorithm
+        self.partition.shape = tuple(shape)
+        # Find adjX and adjY
+        adjX = self.decompose_and_count(X)
+        adjY = self.decompose_and_count(Y)
+        # Find set difference from adjX and adjY
+        adjDiff = Counter()
+        for key, count in adjX.items():
+            if key not in adjY:
+                adjDiff[key] = count    
+        # Restore previous value of shape in partition algorithm
+        self.partition.shape = old_shape
+        # Calculate the BDM(x|y)
+        cmx = self.compute_bdm(adjDiff) + self.compute_f_of_intersect(adjX, adjY)
+        if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0:
+            raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions")
+        return cmx
+    
+    def compute_f_of_intersect(self, adjX, adjY):
+        """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y).
+        [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356)
+
+        Parameters
+        ----------
+        adjX, adjY : Counter
+            Counter objects grouping object keys and occurrences.
+
+        Returns
+        -------
+        float
+            f(n_xi, n_yi)
+
+        Notes
+        -----
+        Detailed description can be found in :doc:`theory`.
+
+        Examples
+        --------
+        >>> from collections import Counter
+        >>> bdm = BDM(ndim=1)
+        >>> c1 = Counter([('111111111111', 1.95207842085224e-08), ('111111111111', 1.95207842085224e-08)])
+        >>> c2 = Counter([('111111111111', 1.95207842085224e-08)])
+        >>> bdm.compute_f_of_intersect(c1, c2) # doctest: +FLOAT_CMP
+        1.0
+        """
+        if not isinstance(adjX, Counter) or not isinstance(adjY, Counter):
+            return NotImplemented
+        bdm = 0
+        for elem, count in adjX.items():
+            if elem in adjY and adjY[elem] != count:
+                bdm += log2(count)
+        return bdm
+    
     def nbdm(self, X, **kwds):
         """Alias for normalized BDM
 
diff --git a/setup.cfg b/setup.cfg
index 99ea8cd..fef81e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -26,3 +26,4 @@ doctest_plus = enabled
 filterwarnings =
     ignore:CTM dataset does not contain object
     ignore:Using or importing the ABCs
+    
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 4898c9b..c002d9f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,7 @@
 """*PyTest* configuration and general purpose fixtures."""
 import pytest
 from pybdm import BDM
-from pybdm.partitions import PartitionRecursive
+from pybdm.partitions import PartitionCorrelated, PartitionRecursive
 
 
 def pytest_addoption(parser):
@@ -51,3 +51,7 @@ def bdm_d2():
 @pytest.fixture(scope='session')
 def bdm_d1_b9():
     return BDM(ndim=1, nsymbols=9, partition=PartitionRecursive, min_length=1)
+
+@pytest.fixture(scope='session')
+def bdm_d1_collapse3():
+    return BDM(ndim=1, partition=PartitionCorrelated, shift=3)
diff --git a/tests/test_bdm.py b/tests/test_bdm.py
index 681ed3b..2c5654c 100755
--- a/tests/test_bdm.py
+++ b/tests/test_bdm.py
@@ -162,6 +162,14 @@ def test_nent_d2(self, bdm_d2, X, expected):
         output = bdm_d2.nent(X)
         assert output == approx(expected)
 
+    @pytest.mark.parametrize('X,Y,expected', [
+        (array_from_string('010101010101010101111', shape=(21,)), array_from_string('010', shape=(3,)), 14.071500815885443),
+        (array_from_string('010101010101010101111', shape=(21,)), array_from_string('000', shape=(3,)), 19.57688365108973),
+    ])
+    def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected):
+        output = bdm_d1_collapse3.conditional_bdm(X, Y)
+        assert output == approx(expected)
+
     @pytest.mark.slow
     def test_bdm_parallel(self, bdm_d2):
         X = np.ones((500, 500), dtype=int)
diff --git a/tox.ini b/tox.ini
index 302d0be..9aa7cb9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,6 +5,11 @@
 [tox]
 envlist = py35, py36, py37, style, docs
 
+[pytest]
+markers =
+    slow: marks tests as slow (deselect with '-m "not slow"')
+    serial
+
 [testenv]
 setenv =
     PYTHONPATH = {toxinidir}:{toxinidir}/pybdm

From 71461aa08396034e7cf81cddc11c4dcfad97296e Mon Sep 17 00:00:00 2001
From: Jirawat I <nodtem66@gmail.com>
Date: Mon, 19 Apr 2021 13:33:51 +0700
Subject: [PATCH 2/2] Fix lint warnings and docs

---
 docs/conf.py               |  1 +
 docs/references.bib        | 11 +++++++++++
 pybdm/algorithms.py        |  2 +-
 pybdm/bdm.py               | 29 ++++++++++++++---------------
 pybdm/options.py           |  4 ++--
 pybdm/utils.py             |  2 +-
 tests/test_bdm.py          | 12 ++++++++++--
 tests/test_partitions.py   |  2 +-
 tests/test_perturbation.py |  4 ++--
 9 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 462b62a..7636cfe 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -39,6 +39,7 @@
     'sphinx.ext.mathjax',
     'sphinxcontrib.bibtex'
 ]
+bibtex_bibfiles = ['references.bib']
 
 
 # Napoleon settings
diff --git a/docs/references.bib b/docs/references.bib
index 18f87a2..7148746 100644
--- a/docs/references.bib
+++ b/docs/references.bib
@@ -210,3 +210,14 @@ @article{morzy_measuring_2017
 	file = {Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:/home/sztal/Zotero/storage/JEWZ5RI6/Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:application/pdf}
 }
 
+@article{hernndez_ml_2021,
+  doi = {10.3389/frai.2020.567356},
+  url = {https://doi.org/10.3389/frai.2020.567356},
+  year = {2021},
+  month = jan,
+  publisher = {Frontiers Media {SA}},
+  volume = {3},
+  author = {Santiago Hern{\'{a}}ndez-Orozco and Hector Zenil and J\"{u}rgen Riedel and Adam Uccello and Narsis A. Kiani and Jesper Tegn{\'{e}}r},
+  title = {Algorithmic Probability-Guided Machine Learning on Non-Differentiable Spaces},
+  journal = {Frontiers in Artificial Intelligence}
+}
diff --git a/pybdm/algorithms.py b/pybdm/algorithms.py
index 694440e..86cbe61 100644
--- a/pybdm/algorithms.py
+++ b/pybdm/algorithms.py
@@ -258,7 +258,7 @@ def run(self, idx=None, values=None, keep_changes=False):
         """
         if idx is None:
             indexes = [ range(k) for k in self.X.shape ]
-            idx = np.array([ x for x in product(*indexes) ], dtype=int)
+            idx = np.array(list(product(*indexes)), dtype=int)
         if values is None:
             values = np.full((idx.shape[0], ), -1, dtype=int)
         return np.apply_along_axis(
diff --git a/pybdm/bdm.py b/pybdm/bdm.py
index 4bd6023..67af1d0 100644
--- a/pybdm/bdm.py
+++ b/pybdm/bdm.py
@@ -129,21 +129,21 @@ def __init__(self, ndim, nsymbols=2, shape=None, partition=PartitionIgnore,
         self.ndim = ndim
         try:
             self.ctmname = ctmname if ctmname else self._ndim_to_ctm[(ndim, nsymbols)]
-        except KeyError:
+        except KeyError as key_error:
             msg = "no CTM dataset for 'ndim={}' and 'nsymbols={}'".format(
                 ndim, nsymbols
             )
-            raise CTMDatasetNotFoundError(msg)
+            raise CTMDatasetNotFoundError(msg) from key_error
         try:
             nsymbols, _shape = self.ctmname.split('-')[-2:]
-        except ValueError:
+        except ValueError as value_error:
             msg = "incorrect 'ctmname'; it should be in format " + \
                 "'name-b<nsymbols>-d<shape>'"
-            raise BDMConfigurationError(msg)
+            raise BDMConfigurationError(msg) from value_error
         self.nsymbols = int(nsymbols[1:])
         if shape is None:
             shape = tuple(int(x) for x in _shape[1:].split('x'))
-        if any([ x != shape[0] for x in shape ]):
+        if any(x != shape[0] for x in shape):
             raise BDMConfigurationError("'shape' has to be equal in each dimension")
         ctm, ctm_missing = get_ctm_dataset(self.ctmname)
         self._ctm = ctm
@@ -399,8 +399,7 @@ def bdm(self, X, normalized=False, check_data=True):
         return cmx
 
     def conditional_bdm(self, X, Y, min_length=0, check_data=True):
-        """Approximate complexity of a Coarse Conditional BDM(x|y)
-        [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356)
+        """Approximate complexity of a Coarse Conditional BDM(x|y) :cite:`hernndez_ml_2021`
 
         Parameters
         ----------
@@ -453,12 +452,12 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True):
         if check_data:
             self._check_data(X)
             self._check_data(Y)
-        
-        # Backup previous value of shape in partition algorithm 
+
+        # Backup previous value of shape in partition algorithm
         old_shape = self.partition.shape
         # Find new minimal shape
         shape = list(old_shape)
-        for i in range(0, len(old_shape)):
+        for i, _ in enumerate(old_shape):
             shape[i] = min(min_length if min_length > 0 else old_shape[i], Y.shape[i])
         # use new shape in partition algorithm
         self.partition.shape = tuple(shape)
@@ -469,7 +468,7 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True):
         adjDiff = Counter()
         for key, count in adjX.items():
             if key not in adjY:
-                adjDiff[key] = count    
+                adjDiff[key] = count
         # Restore previous value of shape in partition algorithm
         self.partition.shape = old_shape
         # Calculate the BDM(x|y)
@@ -477,10 +476,10 @@ def conditional_bdm(self, X, Y, min_length=0, check_data=True):
         if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0:
             raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions")
         return cmx
-    
+
     def compute_f_of_intersect(self, adjX, adjY):
         """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y).
-        [Hernández-Orozco S, Zenil H, (2021)](doi:10.3389/frai.2020.567356)
+        :cite:`hernndez_ml_2021`
 
         Parameters
         ----------
@@ -512,7 +511,7 @@ def compute_f_of_intersect(self, adjX, adjY):
             if elem in adjY and adjY[elem] != count:
                 bdm += log2(count)
         return bdm
-    
+
     def nbdm(self, X, **kwds):
         """Alias for normalized BDM
 
@@ -629,7 +628,7 @@ def _check_data(self, X):
             raise ValueError("'X' has more than {} unique symbols".format(
                 self.nsymbols
             ))
-        valid_symbols = np.array([ _ for _ in range(self.nsymbols) ])
+        valid_symbols = np.arange(self.nsymbols)
         bad_symbols = symbols[~np.isin(symbols, valid_symbols)]
         if bad_symbols.size > 0:
             raise ValueError("'X' contains symbols outside of [0, {}]: {}".format(
diff --git a/pybdm/options.py b/pybdm/options.py
index 73c0ea8..f953091 100644
--- a/pybdm/options.py
+++ b/pybdm/options.py
@@ -59,5 +59,5 @@ def get(name=None):
         return _options.copy()
     try:
         return _options[name]
-    except KeyError:
-        raise KeyError("there is no '{}' option".format(name))
+    except KeyError as key_error:
+        raise KeyError("there is no '{}' option".format(name)) from key_error
diff --git a/pybdm/utils.py b/pybdm/utils.py
index 922ecd6..cce3fe8 100644
--- a/pybdm/utils.py
+++ b/pybdm/utils.py
@@ -150,7 +150,7 @@ def list_ctm_datasets():
     >>> list_ctm_datasets()
     ['CTM-B2-D12', 'CTM-B2-D4x4', 'CTM-B4-D12', 'CTM-B5-D12', 'CTM-B6-D12', 'CTM-B9-D12']
     """
-    return [ x for x in sorted(_ctm_datasets.keys()) ]
+    return list(sorted(_ctm_datasets.keys()))
 
 @lru_cache(maxsize=2**int(np.ceil(np.log2(len(_ctm_datasets)))))
 def get_ctm_dataset(name):
diff --git a/tests/test_bdm.py b/tests/test_bdm.py
index 2c5654c..ffc1ed7 100755
--- a/tests/test_bdm.py
+++ b/tests/test_bdm.py
@@ -163,8 +163,16 @@ def test_nent_d2(self, bdm_d2, X, expected):
         assert output == approx(expected)
 
     @pytest.mark.parametrize('X,Y,expected', [
-        (array_from_string('010101010101010101111', shape=(21,)), array_from_string('010', shape=(3,)), 14.071500815885443),
-        (array_from_string('010101010101010101111', shape=(21,)), array_from_string('000', shape=(3,)), 19.57688365108973),
+        (
+            array_from_string('010101010101010101111', shape=(21,)),
+            array_from_string('010', shape=(3,)),
+            14.071500815885443
+        ),
+        (
+            array_from_string('010101010101010101111', shape=(21,)),
+            array_from_string('000', shape=(3,)),
+            19.57688365108973
+        ),
     ])
     def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected):
         output = bdm_d1_collapse3.conditional_bdm(X, Y)
diff --git a/tests/test_partitions.py b/tests/test_partitions.py
index dfe5044..faa5c8a 100644
--- a/tests/test_partitions.py
+++ b/tests/test_partitions.py
@@ -5,7 +5,7 @@
 
 
 def _test_decompose(partition, X, expected):
-    output = [ p for p in partition.decompose(X) ]
+    output = list(partition.decompose(X))
     assert len(output) == len(expected)
     assert all(np.array_equal(o, e) for o, e in zip(output, expected))
 
diff --git a/tests/test_perturbation.py b/tests/test_perturbation.py
index b625d8f..72d0d64 100644
--- a/tests/test_perturbation.py
+++ b/tests/test_perturbation.py
@@ -82,7 +82,7 @@ class TestPerturbationExperiment:
     ])
     def test_idx_to_parts(self, perturbation, idx, expected):
         expected = [ perturbation.X[s] for s in expected ]
-        output = [ x for x in perturbation._idx_to_parts(idx) ]
+        output = list(perturbation._idx_to_parts(idx))
         assert len(output) == len(expected)
         for o, e in zip(output, expected):
             assert np.array_equal(o, e)
@@ -104,7 +104,7 @@ def test_idx_to_parts(self, perturbation, idx, expected):
     def test_idx_to_parts_overlap(self, perturbation_overlap, idx,  expected):
         perturbation = perturbation_overlap
         expected = [ perturbation.X[s] for s in expected ]
-        output = [ x for x in perturbation._idx_to_parts(idx) ]
+        output = list(perturbation._idx_to_parts(idx))
         assert len(output) == len(expected)
         for o, e in zip(output, expected):
             assert np.array_equal(o, e)