Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conditional_bdm to compute BDM(x|y) #16

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
'sphinx.ext.mathjax',
'sphinxcontrib.bibtex'
]
bibtex_bibfiles = ['references.bib']


# Napoleon settings
Expand Down
11 changes: 11 additions & 0 deletions docs/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,14 @@ @article{morzy_measuring_2017
file = {Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:/home/sztal/Zotero/storage/JEWZ5RI6/Morzy i in. - 2017 - On Measuring the Complexity of Networks Kolmogoro.pdf:application/pdf}
}

@article{hernndez_ml_2021,
doi = {10.3389/frai.2020.567356},
url = {https://doi.org/10.3389/frai.2020.567356},
year = {2021},
month = jan,
publisher = {Frontiers Media {SA}},
volume = {3},
author = {Santiago Hern{\'{a}}ndez-Orozco and Hector Zenil and J\"{u}rgen Riedel and Adam Uccello and Narsis A. Kiani and Jesper Tegn{\'{e}}r},
title = {Algorithmic Probability-Guided Machine Learning on Non-Differentiable Spaces},
journal = {Frontiers in Artificial Intelligence}
}
2 changes: 1 addition & 1 deletion pybdm/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def run(self, idx=None, values=None, keep_changes=False):
"""
if idx is None:
indexes = [ range(k) for k in self.X.shape ]
idx = np.array([ x for x in product(*indexes) ], dtype=int)
idx = np.array(list(product(*indexes)), dtype=int)
if values is None:
values = np.full((idx.shape[0], ), -1, dtype=int)
return np.apply_along_axis(
Expand Down
126 changes: 120 additions & 6 deletions pybdm/bdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,21 +129,21 @@ def __init__(self, ndim, nsymbols=2, shape=None, partition=PartitionIgnore,
self.ndim = ndim
try:
self.ctmname = ctmname if ctmname else self._ndim_to_ctm[(ndim, nsymbols)]
except KeyError:
except KeyError as key_error:
msg = "no CTM dataset for 'ndim={}' and 'nsymbols={}'".format(
ndim, nsymbols
)
raise CTMDatasetNotFoundError(msg)
raise CTMDatasetNotFoundError(msg) from key_error
try:
nsymbols, _shape = self.ctmname.split('-')[-2:]
except ValueError:
except ValueError as value_error:
msg = "incorrect 'ctmname'; it should be in format " + \
"'name-b<nsymbols>-d<shape>'"
raise BDMConfigurationError(msg)
raise BDMConfigurationError(msg) from value_error
self.nsymbols = int(nsymbols[1:])
if shape is None:
shape = tuple(int(x) for x in _shape[1:].split('x'))
if any([ x != shape[0] for x in shape ]):
if any(x != shape[0] for x in shape):
raise BDMConfigurationError("'shape' has to be equal in each dimension")
ctm, ctm_missing = get_ctm_dataset(self.ctmname)
self._ctm = ctm
Expand Down Expand Up @@ -398,6 +398,120 @@ def bdm(self, X, normalized=False, check_data=True):
cmx = (cmx - min_cmx) / (max_cmx - min_cmx)
return cmx

def conditional_bdm(self, X, Y, min_length=0, check_data=True):
    """Approximate complexity of a Coarse Conditional BDM(x|y) :cite:`hernndez_ml_2021`

    Parameters
    ----------
    X : array_like
        Dataset representation as a :py:class:`numpy.ndarray`.
        Number of axes must agree with the `ndim` attribute.
    Y : array_like
        Dataset representation as a :py:class:`numpy.ndarray`.
        Number of axes must agree with the `ndim` attribute.
    min_length : int
        Minimum parts' length. Non-negative.
        In case of multidimensional objects it specifies minimum
        length of any single dimension.
        Default of 0 uses, per dimension, the minimum of the
        partition shape and ``Y.shape``.
    check_data : bool
        Should data format be checked.
        May be disabled to gain some speed when calling multiple times.

    Returns
    -------
    float
        Approximate conditional algorithmic complexity K(x|y).

    Raises
    ------
    TypeError
        If `X` or `Y` is not an integer array and `check_data=True`.
    ValueError
        If `X` or `Y` has more than `nsymbols` unique values
        and `check_data=True`.
    ValueError
        If `X` or `Y` has symbols outside of the ``0`` to `nsymbols-1` range
        and `check_data=True`.
    ValueError
        If computed BDM value is 0 and `raise_if_zero` is ``True``.

    Notes
    -----
    Detailed description can be found in :doc:`theory`.

    Examples
    --------
    >>> import numpy as np
    >>> bdm = BDM(ndim=1, partition=PartitionCorrelated, shift=3)
    >>> X = encoding.array_from_string('010101010101010101111', (21,))
    >>> Y = encoding.array_from_string('010', (3,))
    >>> bdm.conditional_bdm(X, Y) # doctest: +FLOAT_CMP
    14.071500815885443
    """
    if check_data:
        self._check_data(X)
        self._check_data(Y)

    # Backup previous value of shape in the partition algorithm;
    # it is temporarily overridden below and must always be restored.
    old_shape = self.partition.shape
    # Find new minimal shape: per dimension, the smaller of the
    # (possibly overridden) part length and the extent of Y.
    new_shape = tuple(
        min(min_length if min_length > 0 else dim, Y.shape[i])
        for i, dim in enumerate(old_shape)
    )
    self.partition.shape = new_shape
    try:
        # Decompose both objects into part counters under the new shape
        adjX = self.decompose_and_count(X)
        adjY = self.decompose_and_count(Y)
    finally:
        # Restore the partition shape even if decomposition fails,
        # so the BDM object is not left in a corrupted state.
        self.partition.shape = old_shape
    # Parts of X that never occur in Y contribute their full CTM cost
    adjDiff = Counter({key: count for key, count in adjX.items()
                       if key not in adjY})
    # BDM(x|y) = BDM over the set difference + f(n_xi, n_yi) correction
    cmx = self.compute_bdm(adjDiff) + self.compute_f_of_intersect(adjX, adjY)
    if self.raise_if_zero and options.get('raise_if_zero') and cmx == 0:
        raise ValueError("Computed BDM is 0, dataset may have incorrect dimensions")
    return cmx

def compute_f_of_intersect(self, adjX, adjY):
    """Compute additional information f(n_xi, n_yi) based on Coarse Conditional BDM(x|y).
    :cite:`hernndez_ml_2021`

    Parameters
    ----------
    adjX, adjY : collections.Counter
        Counter objects grouping object keys and occurrences.

    Returns
    -------
    float
        f(n_xi, n_yi), the correction term summed over parts shared
        by both counters whose multiplicities differ.

    Notes
    -----
    Detailed description can be found in :doc:`theory`.

    Examples
    --------
    >>> from collections import Counter
    >>> bdm = BDM(ndim=1)
    >>> c1 = Counter([('111111111111', 1.95207842085224e-08), ('111111111111', 1.95207842085224e-08)])
    >>> c2 = Counter([('111111111111', 1.95207842085224e-08)])
    >>> bdm.compute_f_of_intersect(c1, c2) # doctest: +FLOAT_CMP
    1.0
    """
    # Guard: both arguments must be Counters; mirror the binary-operator
    # convention by signalling unsupported operands with NotImplemented.
    if not (isinstance(adjX, Counter) and isinstance(adjY, Counter)):
        return NotImplemented
    # Sum log2 of multiplicities over keys present in both counters
    # with differing counts; empty sum yields 0 as before.
    return sum(
        log2(n_xi)
        for key, n_xi in adjX.items()
        if key in adjY and adjY[key] != n_xi
    )

def nbdm(self, X, **kwds):
"""Alias for normalized BDM

Expand Down Expand Up @@ -514,7 +628,7 @@ def _check_data(self, X):
raise ValueError("'X' has more than {} unique symbols".format(
self.nsymbols
))
valid_symbols = np.array([ _ for _ in range(self.nsymbols) ])
valid_symbols = np.arange(self.nsymbols)
bad_symbols = symbols[~np.isin(symbols, valid_symbols)]
if bad_symbols.size > 0:
raise ValueError("'X' contains symbols outside of [0, {}]: {}".format(
Expand Down
4 changes: 2 additions & 2 deletions pybdm/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,5 +59,5 @@ def get(name=None):
return _options.copy()
try:
return _options[name]
except KeyError:
raise KeyError("there is no '{}' option".format(name))
except KeyError as key_error:
raise KeyError("there is no '{}' option".format(name)) from key_error
2 changes: 1 addition & 1 deletion pybdm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def list_ctm_datasets():
>>> list_ctm_datasets()
['CTM-B2-D12', 'CTM-B2-D4x4', 'CTM-B4-D12', 'CTM-B5-D12', 'CTM-B6-D12', 'CTM-B9-D12']
"""
return [ x for x in sorted(_ctm_datasets.keys()) ]
return list(sorted(_ctm_datasets.keys()))

@lru_cache(maxsize=2**int(np.ceil(np.log2(len(_ctm_datasets)))))
def get_ctm_dataset(name):
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ doctest_plus = enabled
filterwarnings =
ignore:CTM dataset does not contain object
ignore:Using or importing the ABCs

6 changes: 5 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""*PyTest* configuration and general purpose fixtures."""
import pytest
from pybdm import BDM
from pybdm.partitions import PartitionRecursive
from pybdm.partitions import PartitionCorrelated, PartitionRecursive


def pytest_addoption(parser):
Expand Down Expand Up @@ -51,3 +51,7 @@ def bdm_d2():
@pytest.fixture(scope='session')
def bdm_d1_b9():
return BDM(ndim=1, nsymbols=9, partition=PartitionRecursive, min_length=1)

@pytest.fixture(scope='session')
def bdm_d1_collapse3():
    # Session-scoped 1D binary BDM using the correlated (sliding-window)
    # partition with a shift of 3; used by the conditional BDM tests.
    return BDM(ndim=1, partition=PartitionCorrelated, shift=3)
16 changes: 16 additions & 0 deletions tests/test_bdm.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,22 @@ def test_nent_d2(self, bdm_d2, X, expected):
output = bdm_d2.nent(X)
assert output == approx(expected)

@pytest.mark.parametrize('X,Y,expected', [
    # Y ('010') occurs among X's parts, so the intersection correction applies
    (
        array_from_string('010101010101010101111', shape=(21,)),
        array_from_string('010', shape=(3,)),
        14.071500815885443
    ),
    # Y ('000') shares no parts with X, so BDM(x|y) is larger
    (
        array_from_string('010101010101010101111', shape=(21,)),
        array_from_string('000', shape=(3,)),
        19.57688365108973
    ),
])
def test_conditional_bdm(self, bdm_d1_collapse3, X, Y, expected):
    # Expected values computed with PartitionCorrelated, shift=3
    # (see the bdm_d1_collapse3 fixture) — TODO confirm against the paper.
    output = bdm_d1_collapse3.conditional_bdm(X, Y)
    assert output == approx(expected)

@pytest.mark.slow
def test_bdm_parallel(self, bdm_d2):
X = np.ones((500, 500), dtype=int)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_partitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


def _test_decompose(partition, X, expected):
output = [ p for p in partition.decompose(X) ]
output = list(partition.decompose(X))
assert len(output) == len(expected)
assert all(np.array_equal(o, e) for o, e in zip(output, expected))

Expand Down
4 changes: 2 additions & 2 deletions tests/test_perturbation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ class TestPerturbationExperiment:
])
def test_idx_to_parts(self, perturbation, idx, expected):
expected = [ perturbation.X[s] for s in expected ]
output = [ x for x in perturbation._idx_to_parts(idx) ]
output = list(perturbation._idx_to_parts(idx))
assert len(output) == len(expected)
for o, e in zip(output, expected):
assert np.array_equal(o, e)
Expand All @@ -104,7 +104,7 @@ def test_idx_to_parts(self, perturbation, idx, expected):
def test_idx_to_parts_overlap(self, perturbation_overlap, idx, expected):
perturbation = perturbation_overlap
expected = [ perturbation.X[s] for s in expected ]
output = [ x for x in perturbation._idx_to_parts(idx) ]
output = list(perturbation._idx_to_parts(idx))
assert len(output) == len(expected)
for o, e in zip(output, expected):
assert np.array_equal(o, e)
Expand Down
5 changes: 5 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
[tox]
envlist = py35, py36, py37, style, docs

[pytest]
markers =
slow: marks tests as slow (deselect with '-m "not slow"')
serial

[testenv]
setenv =
PYTHONPATH = {toxinidir}:{toxinidir}/pybdm
Expand Down