diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ba787e --- /dev/null +++ b/.gitignore @@ -0,0 +1,114 @@ +# Sandbox folders +_sandbox/** +_notebooks/** + +# IDEs +.vscode + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports / benchmarks / profiling +prof/ +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ +.benchmarks + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# VIM +**/*.swp diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..a366809 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,10 @@ +Authors +------- + +**Development Lead** + +* Szymon Talaga + +**Contributors** + +None yet. Why not be the first? diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 0000000..4b5f157 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,111 @@ +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every +little bit helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/sztal/pathcensus/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" +is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "feature" +is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +Path census could always use more documentation, whether as part of the +official Path census docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/sztal/pathcensus/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `pathcensus` for +local development. + +1. Fork_ the `pathcensus` repo on GitHub. +2. 
Clone your fork locally:: + + $ git clone git@github.com:your_name_here/pathcensus.git + +3. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + +Now you can make your changes locally. + +4. When you're done making changes, check that your changes pass style and unit + tests, including testing other Python versions with tox:: + + $ make test-all + +You may first need to run ``pip install -r requirements-tests.txt`` +and then ``pip install tox``. + +5. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +6. Submit a pull request through the GitHub website. + +.. _Fork: https://github.com/sztal/pathcensus/fork + +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 3.8+ + and must pass all tests when running ``make test-all``. + + +Tips +---- + +To run a subset of tests:: + + $ pytest test/unweighted/test_pathcensus.py + +This will run all tests for the unweighted calculations of path census +and structural coefficients. diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 0000000..4498769 --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,9 @@ +.. :changelog: + +History +------- + +0.1 (2022-06-01) +++++++++++++++++++ + +* First release on PyPI. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b62d956 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +The MIT License (MIT) + +Copyright (c) 2021 Szymon Talaga + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..c03f5d7
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include AUTHORS.rst
+include HISTORY.rst
+include LICENSE
+include README.rst
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c844ccf
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,72 @@
+.PHONY: help clean clean-py clean-build clean-test lint test test-all coverage cov-run cov-report docs release sdist
+
+help:
+	@echo "clean       - remove auxiliary files/artifacts"
+	@echo "clean-build - remove build artifacts"
+	@echo "clean-py    - remove Python file artifacts"
+	@echo "clean-test  - remove testing artifacts"
+	@echo "lint        - check style with flake8"
+	@echo "test        - run tests quickly with the default Python"
+	@echo "test-all    - run tests on every Python version with tox"
+	@echo "coverage    - compute code coverage quickly with the default Python"
+	@echo "cov-report  - display coverage report"
+	@echo "docs        - generate Sphinx HTML documentation, including API docs"
+	@echo "release     - package and upload a release"
+	@echo "sdist       - package"
+
+clean: clean-build clean-py clean-test
+
+clean-build:
+	rm -fr build/
+	rm -fr dist/
+	find . -name '*.egg-info' -exec rm -rf {} +
+	find . -name '*.eggs' -exec rm -rf {} +
+
+clean-py:
+	find . -name '__pycache__' -exec rm -rf {} +
+	find . -name '*.pyc' -exec rm -f {} +
+	find . -name '*.pyo' -exec rm -f {} +
+	find . -name '*.nbc' -exec rm -f {} +
+	find . -name '*.nbi' -exec rm -f {} +
+	find . -name '*~' -exec rm -f {} +
+
+clean-test:
+	find . -name '.benchmarks' -exec rm -rf {} +
+	find . -name '.pytest_cache' -exec rm -rf {} +
+	find . -name '.tox' -exec rm -rf {} +
+
+lint:
+	pytest --pylint -m pylint
+
+test:
+	pytest
+
+test-all:
+	tox
+
+cov-run:
+	coverage run --source pathcensus setup.py test
+
+cov-report:
+	coverage report --omit=pathcensus/core/*.py
+	coverage html --omit=pathcensus/core/*.py
+	xdg-open htmlcov/index.html || open htmlcov/index.html
+
+coverage: cov-run cov-report
+
+docs:
+	rm -f docs/pathcensus.rst
+	rm -f docs/modules.rst
+	sphinx-apidoc -o docs/ pathcensus
+	$(MAKE) -C docs clean
+	$(MAKE) -C docs html
+	xdg-open docs/_build/html/index.html || open docs/_build/html/index.html
+
+release: clean
+	python setup.py sdist upload
+	python setup.py bdist_wheel upload
+
+sdist: clean
+	python setup.py sdist
+	python setup.py bdist_wheel
+	ls -l dist
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..d3d30b7
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,349 @@
+=============================
+``pathcensus`` package
+=============================
+
+.. image:: https://badge.fury.io/py/pathcensus.png
+    :target: http://badge.fury.io/py/pathcensus
+
+.. image:: https://travis-ci.org/sztal/pathcensus.png?branch=master
+    :target: https://travis-ci.org/sztal/pathcensus
+
+
+Welcome to the documentation of the ``pathcensus`` package.
+It is a Python (3.8+) implementation of **structural similarity and
+complementarity coefficients** for undirected (un)weighted networks, based
+on efficient counting of 2- and 3-paths (triples and quadruples)
+and 3- and 4-cycles (triangles and quadrangles).
+
+**Structural coefficients are graph-theoretic
+measures of the extent to which relations at different levels
+(of edges, nodes or entire networks) are driven by similarity or
+complementarity between different nodes**. Even though they are defined
+in a purely combinatorial manner, they are motivated by geometric arguments
+which link them to the family of latent space/random geometric graph models.
+In particular, the geometric view allows the identification of network motifs
+characteristic for similarity (triangles) and complementarity (quadrangles).
+They can be seen as a generalization of the well-known
+local and global clustering coefficients, which summarize the structure
+of a network in terms of the density of ego subgraph(s).
+
+Even though it is a Python package, ``pathcensus`` is performant as its main
+workhorse functions are just-in-time (JIT) compiled to efficient C code
+thanks to the `numba`_ library. It is compatible with `numpy`_
+arrays and `scipy`_ sparse matrices, making it easy to use in practice.
+Moreover, it allows registering graph classes implemented by different
+third-party packages, such as `networkx`_, so they can be converted
+automatically to sparse matrices. Conversion methods for `networkx`_,
+`igraph`_ and `graph-tool`_ are registered automatically,
+provided the packages are installed.
+
+For the sake of convenience, ``pathcensus`` also provides implementations
+of appropriate null models for the statistical calibration of structural
+coefficients, which are simple wrappers around the excellent `NEMtropy`_
+package. It also defines the ``pathcensus.inference`` submodule with
+a utility class facilitating approximate statistical inference based on
+sampling from null models.
+
+See the ``examples`` subfolder and the main documentation for more details.
+
+Installation
+============
+
+At the command line via pip:
+
+.. code-block::
+
+    # Not yet on PyPI
+    pip install pathcensus
+
+The current (unstable) development version can be installed
+directly from the `github repo`_:
+
+.. code-block::
+
+    pip install git+ssh://git@github.com/sztal/pathcensus.git
+
+
+How to cite?
+============
+
+If you find the package useful, please cite our work.
+
+**Main theory paper**
+
+    Talaga, S., & Nowak, A. (2022). Structural complementarity and similarity:
+    linking relational principles to network structure. arXiv preprint arXiv:2201.03664.
+
+
+Usage
+=====
+
+**NOTE**
+
+    The main internal functions for calculating the path census are JIT-compiled
+    when used for the first time. Thus, the first initialization of a
+    ``PathCensus`` object may be quite slow, as its execution time will include
+    the time required for compilation. However, this happens only once.
+
+We will use `igraph`_ to generate the graphs used in the examples. However,
+even though it is automatically integrated with ``pathcensus``, `igraph`_
+is not a dependency.
+
+.. code-block:: python
+
+    # Main imports used in the examples below
+    import random
+    import numpy as np
+    import igraph as ig
+    from pathcensus import PathCensus
+
+    # Set random and numpy rng seeds
+    random.seed(303)
+    np.random.seed(101)
+
+More detailed examples can be found in the official documentation.
+
+
+Path census & structural coefficients
+-------------------------------------
+
+The path census is a set of counts of different paths and cycles per edge,
+per node, or for the entire graph. The counts are subsequently used to
+calculate different kinds of structural coefficients.
+
+.. code-block:: python
+
+    # Generate a simple undirected ER random graph
+    G = ig.Graph.Erdos_Renyi(100, p=.05, directed=False)
+    # Initialize the path census object.
+    # It precomputes path/cycle counts at the level of edges;
+    # other counts are derived from them.
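+    #
+    # Note: `G` need not be an igraph object. As described above, a scipy
+    # sparse matrix, or a graph from any registered third-party class,
+    # should also be accepted; e.g. (a hypothetical sketch, assuming
+    # `networkx` is installed):
+    #
+    #     import networkx as nx
+    #     H = nx.gnp_random_graph(100, p=.05, seed=17)
+    #     P = PathCensus(H)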
+    P = PathCensus(G)
+
+    # Get the edge-level census
+    P.census("edges")
+    # Get the node-level census
+    P.census("nodes")    # or just P.census()
+    # Get the global census
+    P.census("global")
+
+    # Column definitions
+    ?P.definitions
+
+Once the path census is computed, it can be used to calculate structural
+coefficients.
+
+.. code-block:: python
+
+    # Similarity coefficients
+    P.tclust()      # triangle-clustering; equivalent to the local clustering coefficient
+    P.tclosure()    # triangle-closure; equivalent to the local closure coefficient
+    P.similarity()  # structural similarity (weighted average of clustering and closure)
+
+    # Edge-wise similarity
+    P.similarity("edges")
+    # Global similarity (equivalent to the global clustering coefficient)
+    P.similarity("global")
+
+The figure below sums up the design of the structural similarity coefficients,
+their geometric motivation and some of their main properties.
+
+.. image:: /docs/figures/sim.svg
+    :align: center
+
+
+.. code-block:: python
+
+    # Complementarity coefficients
+    P.qclust()           # quadrangle-based clustering
+    P.qclosure()         # quadrangle-based closure
+    P.complementarity()  # structural complementarity (weighted average of clustering and closure)
+
+    # Edge-wise complementarity
+    P.complementarity("edges")
+    # Global complementarity
+    P.complementarity("global")
+
+The figure below sums up the design and the geometric motivation of the
+complementarity coefficients as well as their main properties.
+
+.. image:: /docs/figures/comp.svg
+    :align: center
+
+Similarity and/or complementarity coefficients may be calculated in one
+go using the appropriate methods, as shown below.
+
+.. code-block:: python
+
+    # Similarity + corresponding clustering and closure coefs
+    P.simcoefs()           # node-wise
+    P.simcoefs("global")   # global
+
+    # Complementarity + corresponding clustering and closure coefs
+    P.compcoefs()          # node-wise
+    P.compcoefs("global")  # global
+
+    # All coefficients
+    P.coefs()
+    # All coefficients + the full path census
+    P.coefs(census=True)
+
+
+Weighted coefficients
+---------------------
+
+Below we create an ER random graph with random integer edge weights
+between 1 and 10. As long as edge weights are assigned to an edge property
+with the standard name (``"weight"``), they are detected automatically
+and ``pathcensus`` calculates the weighted census. However, the unweighted
+census may be enforced with ``weighted=False``.
+
+.. code-block:: python
+
+    G = ig.Graph.Erdos_Renyi(100, p=0.05, directed=False)
+    G.es["weight"] = np.random.randint(1, 11, G.ecount())
+
+    P = PathCensus(G)
+    P.weighted    # True
+    # Get all coefficients and the full path census
+    P.coefs(census=True)
+
+    # Enforce the unweighted census
+    P = PathCensus(G, weighted=False)
+    P.weighted    # False
+    P.coefs(census=True)
+
+Below is a summary of the construction of the weighted coefficients.
+
+.. image:: /docs/figures/weighted.svg
+    :align: center
+
+
+Parallel ``PathCensus`` algorithm
+---------------------------------
+
+``PathCensus`` objects may be initialized using parallelized algorithms
+by passing ``parallel=True``.
+
+**NOTE**
+
+    Parallel algorithms require an extra compilation step, so the first
+    time ``parallel=True`` is used there will be a significant extra
+    overhead.
+
+.. code-block:: python
+
+    # By default all available threads are used
+    P = PathCensus(G, parallel=True)
+
+    # Use a specific number of threads
+    P = PathCensus(G, parallel=True, num_threads=2)
+
+
+Other features
+==============
+
+The other main features of ``pathcensus`` are:
+
+#. Null models based on the ERGM family.
+#. Utilities for conducting statistical inference based on null models.
+#. Integration with arbitrary classes of graph-like objects.
+
+All these features are documented in the official documentation;
+a rough sketch of the null-model workflow is given below.
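+
+As a rough illustration, a null-model calibration might look as follows.
+This is only a sketch: the names it assumes (the ``UBCM`` class and its
+``fit()`` and ``sample()`` methods) are illustrative and should be checked
+against the API documentation.
+
+.. code-block:: python
+
+    # A minimal sketch of null-model calibration (assumed API, see above)
+    from pathcensus.nullmodels import UBCM
+
+    # Fit the configuration model to the observed degree sequence
+    ubcm = UBCM(G)
+    ubcm.fit()
+
+    # Compare the observed global similarity against a null sample
+    sim_obs = PathCensus(G).similarity("global")
+    sim_null = [
+        PathCensus(R).similarity("global")
+        for R in ubcm.sample(100)  # assumed to yield randomized graphs
+    ]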
+
+
+Testing
+=======
+
+The repository with the package source code can be cloned easily
+from the `github repo`_.
+
+.. code-block::
+
+    git clone git@github.com:sztal/pathcensus.git
+
+It is recommended to work within an isolated virtual environment.
+This can be done easily, for instance, using `conda`_.
+Remember to use a proper Python version (e.g. 3.8 or 3.9).
+
+.. code-block::
+
+    conda create --name my-env python=3.8
+    conda activate my-env
+
+After entering the directory into which the ``pathcensus`` repository
+was cloned, it is enough to install the package locally.
+
+.. code-block:: bash
+
+    pip install .
+    # Or in developer/editable mode
+    pip install --editable .
+
+In order to run the tests it is also necessary to install the test
+dependencies.
+
+.. code-block:: bash
+
+    pip install -r ./requirements-tests.txt
+    # Now the tests can be run
+    pytest
+    # Or alternatively
+    make test
+    # And to run the linter
+    make lint
+
+And similarly for building the documentation from source.
+
+.. code-block:: bash
+
+    pip install -r ./requirements-docs.txt
+    # Now the documentation can be built
+    make docs
+
+Tests targeting different Python versions can be run using the `tox`_ test
+automation framework. You may first need to install `tox`_
+(e.g. ``pip install tox``).
+
+.. code-block:: bash
+
+    make test-all
+    # Or alternatively
+    tox
+
+Test coverage
+-------------
+
+A unit test coverage report can be generated easily.
+
+.. code-block::
+
+    make coverage
+    # The report can be displayed again after running coverage
+    make cov-report
+
+
+Feedback
+========
+
+If you have any suggestions or questions about **Path census**, feel free
+to email me at stalaga@protonmail.com.
+
+If you encounter any errors or problems with **Path census**, please let
+me know! Open an issue in the main GitHub repository:
+https://github.com/sztal/pathcensus.
+
+
+Authors
+=======
+
+* Szymon Talaga
+
+
+
+.. _github repo: https://github.com/sztal/pathcensus
+.. _examples: https://github.com/sztal/pathcensus/tree/master/examples
+.. _conda: https://docs.conda.io/en/latest/
+.. _tox: https://tox.wiki/en/latest/
+.. _numpy: https://numpy.org/
+.. _scipy: https://scipy.org/
+.. _numba: https://numba.pydata.org/
+.. _networkx: https://networkx.org/
+.. _igraph: https://igraph.org/python/
+.. _graph-tool: https://graph-tool.skewed.de/
+.. _NEMtropy: https://pypi.org/project/NEMtropy/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..0e35bee
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/complexity.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/complexity.qhc"
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/complexity"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/complexity"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..d80e5d4 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,290 @@ +# -*- coding: utf-8 -*- +# +# complexity documentation build configuration file, created by +# sphinx-quickstart on Tue Jul 9 22:26:36 2013. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath(".")) + +import pathcensus + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", + "sphinx.ext.mathjax", + "sphinx.ext.doctest", + "sphinxcontrib.bibtex", +] + +# Napoleon settings +napoleon_google_docstring = False +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = False +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True + +# Intersphinx mapping +intersphinx_mapping = { + "numpy": ("https://numpy.org/doc/stable/", None) +} + +extlinks = { + "numpy": ("https://numpy.org/", None), + "scipy": ("https://scipy.org/", None), + "numba": ("https://numba.pydata.org/", "Numba%s"), + "networkx": ("https://networkx.org/", None), + "igraph": ("https://igraph.org/python/", None), + "graph-tool": ("https://graph-tool.skewed.de/", None), + "NEMtropy": ("https://pypi.org/project/NEMtropy/", None) +} + +# Sphinx-Bibtex +bibtex_bibfiles = ["refs.bib"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix of source filenames. +source_suffix = ".rst" + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "Path census" +copyright = "2021, Szymon Talaga" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = pathcensus.__version__ +# The full version, including alpha/beta/rc tags. +release = pathcensus.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ["_build"] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+html_theme = "sphinx_rtd_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "pathcensusdoc"
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+    ("index", "pathcensus.tex", "Path census Documentation",
+     "Szymon Talaga", "manual"),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output --------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ("index", "pathcensus", "Path census Documentation",
+     ["Szymon Talaga"], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output ------------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    ("index", "pathcensus", "Path census Documentation",
+     "Szymon Talaga", "pathcensus", "One line description of project.",
+     "Miscellaneous"),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/figures/comp-weak.svg b/docs/figures/comp-weak.svg
new file mode 100644
index 0000000..969060f
--- /dev/null
+++ b/docs/figures/comp-weak.svg
@@ -0,0 +1,563 @@
+[SVG figure markup omitted; not recoverable from the extracted text]
diff --git a/docs/figures/comp.svg b/docs/figures/comp.svg
new file mode 100644
index 0000000..778e7ad
--- /dev/null
+++ b/docs/figures/comp.svg
@@ -0,0 +1,1277 @@
+[SVG figure markup omitted; summary figure for the complementarity coefficients]
diff --git a/docs/figures/q-head.svg b/docs/figures/q-head.svg
new file mode 100644
index 0000000..dcc544e
--- /dev/null
+++ b/docs/figures/q-head.svg
@@ -0,0 +1,83 @@
+[SVG figure markup omitted]
diff --git a/docs/figures/q-wedge.svg b/docs/figures/q-wedge.svg
new file mode 100644
index 0000000..635620f
--- /dev/null
+++ b/docs/figures/q-wedge.svg
@@ -0,0 +1,83 @@
+[SVG figure markup omitted]
diff --git a/docs/figures/sim.svg b/docs/figures/sim.svg
new file mode 100644
index 0000000..279ce57
--- /dev/null
+++ b/docs/figures/sim.svg
@@ -0,0 +1,1081 @@
+[SVG figure markup omitted; summary figure for the similarity coefficients]
diff --git a/docs/figures/t-head.svg b/docs/figures/t-head.svg
new file mode 100644
index 0000000..c9277e9
--- /dev/null
+++ b/docs/figures/t-head.svg
@@ -0,0 +1,111 @@
+[SVG figure markup omitted; node labels: k, i, j]
diff --git a/docs/figures/t-wedge.svg b/docs/figures/t-wedge.svg
new file mode 100644
index 0000000..de09bf2
--- /dev/null
+++ b/docs/figures/t-wedge.svg
@@ -0,0 +1,111 @@
+[SVG figure markup omitted; node labels: i, j, k]
diff --git a/docs/figures/weighted.svg b/docs/figures/weighted.svg
new file mode 100644
index 0000000..29546e8
--- /dev/null
+++ b/docs/figures/weighted.svg
@@ -0,0 +1,626 @@
+[SVG figure markup omitted; summary figure for the weighted coefficients]
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..1e2ca12
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,32 @@
+.. complexity documentation master file, created by
+   sphinx-quickstart on Tue Jul 9 22:26:36 2013.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+.. include:: /sections/overview.rst
+
+Contents
+========
+
+.. toctree::
+    :maxdepth: 1
+
+    /sections/installation
+    /sections/coefficients
+    /sections/nullmodels
+    /sections/inference
+    /sections/graphs
+    /sections/references
+    /sections/about
+    /sections/contributing
+    modules
+
+
+Feedback
+========
+
+If you have any suggestions or questions about **Path census**, feel free
+to email me at stalaga@protonmail.com.
+
+If you encounter any errors or problems with **Path census**, please let
+me know! Open an issue in the main GitHub repository:
+https://github.com/sztal/pathcensus.
diff --git a/docs/links.rst b/docs/links.rst
new file mode 100644
index 0000000..771f2de
--- /dev/null
+++ b/docs/links.rst
@@ -0,0 +1,11 @@
+.. _github repo: https://github.com/sztal/pathcensus
+.. _examples: https://github.com/sztal/pathcensus/tree/master/examples
+.. _conda: https://docs.conda.io/en/latest/
+.. _tox: https://tox.wiki/en/latest/
+.. _numpy: https://numpy.org/
+.. _scipy: https://scipy.org/
+.. _numba: https://numba.pydata.org/
+.. _networkx: https://networkx.org/
+.. _igraph: https://igraph.org/python/
+.. _graph-tool: https://graph-tool.skewed.de/
+.. _NEMtropy: https://pypi.org/project/NEMtropy/
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..2df9a8c
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,242 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\complexity.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\complexity.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %BUILDDIR%/.. + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 
+ goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 0000000..c31f691 --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +pathcensus +========== + +.. toctree:: + :maxdepth: 4 + + pathcensus diff --git a/docs/pathcensus.core.rst b/docs/pathcensus.core.rst new file mode 100644 index 0000000..9843651 --- /dev/null +++ b/docs/pathcensus.core.rst @@ -0,0 +1,45 @@ +pathcensus.core package +======================= + +Submodules +---------- + +pathcensus.core.graph module +---------------------------- + +.. automodule:: pathcensus.core.graph + :members: + :undoc-members: + :show-inheritance: + +pathcensus.core.parallel module +------------------------------- + +.. automodule:: pathcensus.core.parallel + :members: + :undoc-members: + :show-inheritance: + +pathcensus.core.random module +----------------------------- + +.. automodule:: pathcensus.core.random + :members: + :undoc-members: + :show-inheritance: + +pathcensus.core.types module +---------------------------- + +.. automodule:: pathcensus.core.types + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: pathcensus.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/pathcensus.nullmodels.rst b/docs/pathcensus.nullmodels.rst new file mode 100644 index 0000000..68567d0 --- /dev/null +++ b/docs/pathcensus.nullmodels.rst @@ -0,0 +1,40 @@ +pathcensus.nullmodels package +============================= + +Submodules +---------- + +pathcensus.nullmodels.base module +--------------------------------- + +.. automodule:: pathcensus.nullmodels.base + :members: + :undoc-members: + :show-inheritance: + :noindex: + +pathcensus.nullmodels.ubcm module +--------------------------------- + +.. automodule:: pathcensus.nullmodels.ubcm + :members: + :undoc-members: + :show-inheritance: + :noindex: + +pathcensus.nullmodels.uecm module +--------------------------------- + +.. automodule:: pathcensus.nullmodels.uecm + :members: + :undoc-members: + :show-inheritance: + :noindex: + +Module contents +--------------- + +.. automodule:: pathcensus.nullmodels + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/pathcensus.rst b/docs/pathcensus.rst new file mode 100644 index 0000000..48ccb93 --- /dev/null +++ b/docs/pathcensus.rst @@ -0,0 +1,70 @@ +pathcensus package +================== + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + pathcensus.core + pathcensus.nullmodels + +Submodules +---------- + +pathcensus.definitions module +----------------------------- + +.. automodule:: pathcensus.definitions + :members: + :undoc-members: + :show-inheritance: + +pathcensus.graph module +----------------------- + +.. automodule:: pathcensus.graph + :members: + :undoc-members: + :show-inheritance: + +pathcensus.inference module +--------------------------- + +.. automodule:: pathcensus.inference + :members: + :undoc-members: + :show-inheritance: + +pathcensus.pathcensus module +---------------------------- + +.. automodule:: pathcensus.pathcensus + :members: + :undoc-members: + :show-inheritance: + +pathcensus.types module +----------------------- + +.. automodule:: pathcensus.types + :members: + :undoc-members: + :show-inheritance: + +pathcensus.utils module +----------------------- + +.. automodule:: pathcensus.utils + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: pathcensus + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/refs.bib b/docs/refs.bib new file mode 100644 index 0000000..9400d88 --- /dev/null +++ b/docs/refs.bib @@ -0,0 +1,84 @@ + +@inproceedings{ahmedEfficientGraphletCounting2015, + title = {Efficient {{Graphlet Counting}} for {{Large Networks}}}, + booktitle = {2015 {{IEEE International Conference}} on {{Data Mining}}}, + author = {Ahmed, Nesreen K. and Neville, Jennifer and Rossi, Ryan A. and Duffield, Nick}, + year = {2015}, + pages = {1--10}, + publisher = {{IEEE}}, + address = {{Atlantic City, NJ, USA}}, + doi = {10.1109/ICDM.2015.141}, + abstract = {From social science to biology, numerous applications often rely on graphlets for intuitive and meaningful characterization of networks at both the global macro-level as well as the local micro-level. While graphlets have witnessed a tremendous success and impact in a variety of domains, there has yet to be a fast and efficient approach for computing the frequencies of these subgraph patterns. However, existing methods are not scalable to large networks with millions of nodes and edges, which impedes the application of graphlets to new problems that require largescale network analysis. To address these problems, we propose a fast, efficient, and parallel algorithm for counting graphlets of size k = \{3, 4\}-nodes that take only a fraction of the time to compute when compared with the current methods used. The proposed graphlet counting algorithms leverages a number of proven combinatorial arguments for different graphlets. For each edge, we count a few graphlets, and with these counts along with the combinatorial arguments, we obtain the exact counts of others in constant time. On a large collection of 300+ networks from a variety of domains, our graphlet counting strategies are on average 460x faster than current methods. This brings new opportunities to investigate the use of graphlets on much larger networks and newer applications as we show in the experiments. To the best of our knowledge, this paper provides the largest graphlet computations to date as well as the largest systematic investigation on over 300+ networks from a variety of domains.}, + isbn = {978-1-4673-9504-5}, + langid = {english} +} + +@article{barratArchitectureComplexWeighted2004, + title = {The Architecture of Complex Weighted Networks}, + author = {Barrat, A. and Barthelemy, M. and {Pastor-Satorras}, R. 
and Vespignani, A.}, + year = {2004}, + journal = {Proceedings of the National Academy of Sciences}, + volume = {101}, + number = {11}, + pages = {3747--3752}, + issn = {0027-8424, 1091-6490}, + doi = {10.1073/pnas.0400087101}, + langid = {english} +} + +@article{jiaMeasuringQuadrangleFormation2020, + title = {Measuring {{Quadrangle Formation}} in {{Complex Networks}}}, + author = {Jia, Mingshan and Gabrys, Bogdan and Musial, Katarzyna}, + year = {2020}, + journal = {arXiv:2011.10763 [cs]}, + eprint = {2011.10763}, + eprinttype = {arxiv}, + primaryclass = {cs}, + abstract = {The classic clustering coefficient and the lately proposed closure coefficient quantify the formation of triangles from two different perspectives, with the focal node at the centre or at the end in an open triad respectively. As many networks are naturally rich in triangles, they become standard metrics to describe and analyse networks. However, the advantages of applying them can be limited in networks, where there are relatively few triangles but which are rich in quadrangles, such as the protein-protein interaction networks, the neural networks and the food webs. This yields for other approaches that would leverage quadrangles in our journey to better understand local structures and their meaning in different types of networks. Here we propose two quadrangle coefficients, i.e., the i-quad coefficient and the o-quad coefficient, to quantify quadrangle formation in networks, and we further extend them to weighted networks. Through experiments on 16 networks from six different domains, we first reveal the density distribution of the two quadrangle coefficients, and then analyse their correlations with node degree. Finally, we demonstrate that at network-level, adding the average i-quad coefficient and the average o-quad coefficient leads to significant improvement in network classification, while at node-level, the i-quad and o-quad coefficients are useful features to improve link prediction.}, + archiveprefix = {arXiv}, + langid = {english}, + keywords = {Computer Science - Social and Information Networks} +} + +@article{vallaranoFastScalableLikelihood2021, + title = {Fast and Scalable Likelihood Maximization for {{Exponential Random Graph Models}} with Local Constraints}, + author = {Vallarano, Nicol{\`o} and Bruno, Matteo and Marchese, Emiliano and Trapani, Giuseppe and Saracco, Fabio and Cimini, Giulio and Zanon, Mario and Squartini, Tiziano}, + year = {2021}, + journal = {Scientific Reports}, + volume = {11}, + number = {1}, + pages = {15227}, + issn = {2045-2322}, + doi = {10.1038/s41598-021-93830-4}, + abstract = {Abstract Exponential Random Graph Models (ERGMs) have gained increasing popularity over the years. Rooted into statistical physics, the ERGMs framework has been successfully employed for reconstructing networks, detecting statistically significant patterns in graphs, counting networked configurations with given properties. From a technical point of view, the ERGMs workflow is defined by two subsequent optimization steps: the first one concerns the maximization of Shannon entropy and leads to identify the functional form of the ensemble probability distribution that is maximally non-committal with respect to the missing information; the second one concerns the maximization of the likelihood function induced by this probability distribution and leads to its numerical determination. 
This second step translates into the resolution of a system of O(N) non-linear, coupled equations (with N being the total number of nodes of the network under analysis), a problem that is affected by three main issues, i.e. accuracy, speed and scalability. The present paper aims at addressing these problems by comparing the performance of three algorithms (i.e. Newton's method, a quasi-Newton method and a recently-proposed fixed-point recipe) in solving several ERGMs, defined by binary and weighted constraints in both a directed and an undirected fashion. While Newton's method performs best for relatively little networks, the fixed-point recipe is to be preferred when large configurations are considered, as it ensures convergence to the solution within seconds for networks with hundreds of thousands of nodes (e.g. the Internet, Bitcoin). We attach to the paper a Python code implementing the three aforementioned algorithms on all the ERGMs considered in the present work.}, +  langid = {english} +} + +@article{wattsCollectiveDynamicsSmallworld1998, +  title = {Collective Dynamics of `Small-World' Networks}, +  author = {Watts, D. J. and Strogatz, S. H.}, +  year = {1998}, +  journal = {Nature}, +  volume = {393}, +  number = {6684}, +  pages = {440--442}, +  doi = {10.1038/30918}, +  langid = {english} +} + +@inproceedings{yinLocalClosureCoefficient2019, +  title = {The {{Local Closure Coefficient}}: {{A New Perspective On Network Clustering}}}, +  shorttitle = {The {{Local Closure Coefficient}}}, +  booktitle = {Proceedings of the {{Twelfth ACM International Conference}} on {{Web Search}} and {{Data Mining}}}, +  author = {Yin, Hao and Benson, Austin R. and Leskovec, Jure}, +  year = {2019}, +  pages = {303--311}, +  publisher = {{ACM}}, +  address = {{Melbourne VIC Australia}}, +  doi = {10.1145/3289600.3290991}, +  abstract = {The phenomenon of edge clustering in real-world networks is a fundamental property underlying many ideas and techniques in network science. Clustering is typically quantified by the clustering coefficient, which measures the fraction of pairs of neighbors of a given center node that are connected. However, many common explanations of edge clustering attribute the triadic closure to a ``head'' node instead of the center node of a length-2 path\textemdash for example, ``a friend of my friend is also my friend.'' While such explanations are common in network analysis, there is no measurement for edge clustering that can be attributed to the head node.}, +  isbn = {978-1-4503-5940-5}, +  langid = {english} +} + + diff --git a/docs/sections/about.rst b/docs/sections/about.rst new file mode 100644 index 0000000..d0e740a --- /dev/null +++ b/docs/sections/about.rst @@ -0,0 +1,6 @@ +About +===== + +.. include:: ./citation.rst +.. include:: ../../AUTHORS.rst +.. include:: ../../HISTORY.rst diff --git a/docs/sections/citation.rst b/docs/sections/citation.rst new file mode 100644 index 0000000..44d4eb8 --- /dev/null +++ b/docs/sections/citation.rst @@ -0,0 +1,9 @@ +How to cite? +------------ + +If you find the package useful, please cite our work. + +`Main theory paper <https://arxiv.org/abs/2201.03664>`_ + +    Talaga, S., & Nowak, A. (2022). Structural complementarity and similarity: +    linking relational principles to network structure. arXiv preprint arXiv:2201.03664. 
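+ +The following BibTeX entry may be used (a sketch derived from the reference above; the entry key is arbitrary and the fields should be checked against the arXiv record): + +.. code-block:: bibtex + +    % hypothetical entry key; verify fields against the arXiv record +    @misc{talaga2022structural, +        title = {Structural complementarity and similarity: +                 linking relational principles to network structure}, +        author = {Talaga, S. and Nowak, A.}, +        year = {2022}, +        eprint = {2201.03664}, +        archiveprefix = {arXiv} +    }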
diff --git a/docs/sections/coefficients.rst b/docs/sections/coefficients.rst new file mode 100644 index 0000000..2da6b02 --- /dev/null +++ b/docs/sections/coefficients.rst @@ -0,0 +1,4 @@ +Path census & structural coefficients +===================================== + +.. automodule:: pathcensus.pathcensus diff --git a/docs/sections/contributing.rst b/docs/sections/contributing.rst new file mode 100644 index 0000000..ac7b6bc --- /dev/null +++ b/docs/sections/contributing.rst @@ -0,0 +1 @@ +.. include:: ../../CONTRIBUTING.rst diff --git a/docs/sections/graphs.rst b/docs/sections/graphs.rst new file mode 100644 index 0000000..698fa7f --- /dev/null +++ b/docs/sections/graphs.rst @@ -0,0 +1,4 @@ +Integration with graph-like classes +----------------------------------- + +.. automodule:: pathcensus.graph diff --git a/docs/sections/inference.rst b/docs/sections/inference.rst new file mode 100644 index 0000000..06d59c1 --- /dev/null +++ b/docs/sections/inference.rst @@ -0,0 +1,12 @@ +Approximate statistical inference +================================= + +.. note:: + +    Additional examples and a demonstration of the correctness +    of the ``inference`` submodule are presented in the `examples`_ +    subfolder of the `github repo`_. + +.. automodule:: pathcensus.inference + +.. include:: /links.rst diff --git a/docs/sections/installation.rst b/docs/sections/installation.rst new file mode 100644 index 0000000..b22d9d2 --- /dev/null +++ b/docs/sections/installation.rst @@ -0,0 +1,89 @@ +============ +Installation +============ + +At the command line, via pip: + +.. code-block:: + +    pip install pathcensus + +The current (unstable) development version can be installed +directly from the `github repo`_: + +.. code-block:: + +    pip install git+ssh://git@github.com/sztal/pathcensus.git + + +Development & testing +--------------------- + +The repository with the package source code can be cloned easily +from the `github repo`_: + +.. code-block:: + +    git clone git@github.com:sztal/pathcensus.git + +It is recommended to work within an isolated virtual environment, +which can be set up easily with, for instance, `conda`_. +Remember to use a supported Python version (e.g. 3.8 or 3.9). + +.. code-block:: + +    conda create --name my-env python=3.8 +    conda activate my-env + +After entering the directory into which the ``pathcensus`` repository +was cloned, it is enough to install the package locally: + +.. code-block:: bash + +    pip install . +    # Or in developer/editable mode +    pip install --editable . + +In order to run tests, the test dependencies must also be installed: + +.. code-block:: bash + +    pip install -r ./requirements-tests.txt +    # Now tests can be run +    pytest +    # Or alternatively +    make test +    # And to run linter +    make lint + +The same goes for building the documentation from source: + +.. code-block:: bash + +    pip install -r ./requirements-docs.txt +    # Now documentation can be built +    make docs + +Tests targeting different Python versions can be run using the `tox`_ +test automation framework. You may first need to install `tox`_ +(e.g. ``pip install tox``). + +.. code-block:: bash + +    make test-all +    # Or alternatively +    tox + + +.. include:: /links.rst + +Test coverage +~~~~~~~~~~~~~ + +A unit test coverage report can be generated easily: + +.. 
code-block:: + +    make coverage +    # Report can be displayed again after running coverage +    make cov-report diff --git a/docs/sections/nullmodels.rst b/docs/sections/nullmodels.rst new file mode 100644 index 0000000..e0bcc2a --- /dev/null +++ b/docs/sections/nullmodels.rst @@ -0,0 +1,25 @@ +Null models +=========== + +.. note:: + +    Additional examples and a demonstration of the correctness +    of the ``nullmodels`` submodule are presented in the `examples`_ +    subfolder of the `github repo`_. + +.. automodule:: pathcensus.nullmodels.base + + +Undirected Binary Configuration Model (UBCM) +-------------------------------------------- + +.. automodule:: pathcensus.nullmodels.ubcm + + +Undirected Enhanced Configuration Model (UECM) +---------------------------------------------- + +.. automodule:: pathcensus.nullmodels.uecm + + +.. include:: /links.rst diff --git a/docs/sections/overview.rst b/docs/sections/overview.rst new file mode 100644 index 0000000..fd8918b --- /dev/null +++ b/docs/sections/overview.rst @@ -0,0 +1,41 @@ +Overview +======== + +Welcome to the documentation of the ``pathcensus`` package. +It is a Python (3.8+) implementation of **structural similarity and +complementarity coefficients** for undirected (un)weighted networks based +on efficient counting of 2- and 3-paths (triples and quadruples) +and 3- and 4-cycles (triangles and quadrangles). + +**Structural coefficients are graph-theoretic +measures of the extent to which relations at different levels +(of edges, nodes or entire networks) are driven by similarity or +complementarity between different nodes**. Even though they are defined +in a purely combinatorial manner, they are motivated by geometric arguments +which link them to the family of latent space/random geometric graph models. +In particular, the geometric view allows the identification of network motifs +characteristic of similarity (triangles) and complementarity (quadrangles). +They can be seen as a generalization of the well-known +local and global clustering coefficients, which summarize the structure +of a network in terms of the density of ego subgraph(s). + +Even though it is a Python package, ``pathcensus`` is performant, as its main +workhorse functions are just-in-time (JIT) compiled to efficient machine code +thanks to the `numba`_ library. It is compatible with `numpy`_ +arrays and `scipy`_ sparse matrices, making it easy to use in practice. +Moreover, it allows registering graph classes implemented by different +third-party packages, such as `networkx`_, so they can be converted +automatically to sparse matrices. Conversion methods for `networkx`_, +`igraph`_ and `graph-tool`_ are registered automatically +provided the packages are installed. + +For the sake of convenience ``pathcensus`` also provides implementations +of the most appropriate null models for statistical calibration of structural +coefficients, which are simple wrappers around the excellent `NEMtropy`_ +package :cite:p:`vallaranoFastScalableLikelihood2021`. It also defines +the :py:mod:`pathcensus.inference` submodule with a utility class for +facilitating approximate statistical inference based on sampling from +null models. + +.. include:: /sections/citation.rst +.. include:: /links.rst diff --git a/docs/sections/references.rst b/docs/sections/references.rst new file mode 100644 index 0000000..3d0c6fb --- /dev/null +++ b/docs/sections/references.rst @@ -0,0 +1,4 @@ +References +========== + +.. 
bibliography:: diff --git a/examples/1-motif-counting-tests.ipynb b/examples/1-motif-counting-tests.ipynb new file mode 100644 index 0000000..cc790ce --- /dev/null +++ b/examples/1-motif-counting-tests.ipynb @@ -0,0 +1,2377 @@ +{ + "cells": [ +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "# Tests of motif counting functions\n", +    "\n", +    "Here we show a set of test cases for motif counting functions\n", +    "and structural coefficients which are simple enough to be checked\n", +    "visually by humans. Example graphs in this notebook are generated using\n", +    "the `igraph` package, so it needs to be installed. Plotting also requires\n", +    "the `pycairo` or `cairocffi` package." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 1, +   "metadata": {}, +   "outputs": [], +   "source": [ +    "import numpy as np\n", +    "import igraph as ig\n", +    "from pathcensus import PathCensus\n", +    "\n", +    "\n", +    "def plot(graph, bbox=(300,300), vertex_label=None, vertex_color=\"gray\", **kwds):\n", +    "    \"\"\"Simple graph plotting function.\n", +    "    \n", +    "    In most cases it can be run without using any non-default values\n", +    "    of the arguments.\n", +    "    \"\"\"\n", +    "    if vertex_label is None:\n", +    "        vertex_label = np.arange(graph.vcount())\n", +    "    return ig.plot(graph, bbox=bbox, vertex_label=vertex_label, \n", +    "        vertex_color=vertex_color, **kwds)\n", +    "\n", +    "def add_random_weights(graph, vmin=1, vmax=10, seed=None):\n", +    "    \"\"\"Add random integer edge weights in the range ``[vmin, vmax]``.\"\"\"\n", +    "    if seed is not None:\n", +    "        np.random.seed(seed)\n", +    "    graph = graph.copy()\n", +    "    weights = np.random.randint(vmin, vmax+1, (graph.ecount(),))\n", +    "    graph.es[\"weight\"] = weights\n", +    "    return graph" +   ] +  }, +  { +   "cell_type": "markdown", +   "metadata": {}, +   "source": [ +    "## Triangle\n", +    "\n", +    "Triangle-based measures are $t$-clustering (`tclust`), $t$-closure (`tclosure`)\n", +    "and structural similarity (at the level of edges, nodes and global). Below\n", +    "are their mathematical definitions which can be used to check correctness\n", +    "of the calculations." +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 2, +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "image/svg+xml": "(SVG image data omitted)", +      "text/plain": [ +       "" +      ] +     }, +     "execution_count": 2, +     "metadata": { +      "image/svg+xml": { +       "isolated": true +      } +     }, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "G = ig.Graph(directed=False)\n", +    "G.add_vertices(3)\n", +    "G.add_edges([\n", +    "    (0, 1), (1, 2), (2, 0)\n", +    "])\n", +    "\n", +    "# Calculate path census\n", +    "P = PathCensus(G)\n", +    "\n", +    "plot(G)" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "645498d9", +   "metadata": {}, +   "source": [ +    "### Node-level measures\n", +    "\n", +    "#### `tclust` ($t$-clustering or local clustering coefficient)\n", +    "\n", +    "$$\n", +    "s^W_i = \\frac{2T_i}{t^W_i}\n", +    "$$\n", +    "\n", +    "#### `tclosure` ($t$-closure or local closure coefficient)\n", +    "\n", +    "$$\n", +    "s^H_i = \\frac{2T_i}{t^H_i}\n", +    "$$\n", +    "\n", +    "#### `sim` (structural similarity)\n", +    "\n", +    "$$\n", +    "s_i = \\frac{4T_i}{t^W_i + t^H_i}\n", +    "$$" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 3, +   "id": "acce04e1", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/html": [ +       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simtclusttclosurecompqclustqclosurettwthq0qwqh
i
01.01.01.0NaNNaNNaN122000
11.01.01.0NaNNaNNaN122000
21.01.01.0NaNNaNNaN122000
\n", + "
" + ], + "text/plain": [ + " sim tclust tclosure comp qclust qclosure t tw th q0 qw qh\n", + "i \n", + "0 1.0 1.0 1.0 NaN NaN NaN 1 2 2 0 0 0\n", + "1 1.0 1.0 1.0 NaN NaN NaN 1 2 2 0 0 0\n", + "2 1.0 1.0 1.0 NaN NaN NaN 1 2 2 0 0 0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"nodes\", census=True)" + ] + }, + { + "cell_type": "markdown", + "id": "74140b87", + "metadata": {}, + "source": [ + "### Edge-level measures\n", + "\n", + "#### Edge-wise similarity\n", + "\n", + "$$\n", + "s_{ij} = \\frac{2T_{ij}}{t^W_{ij} + t^H_{ij}}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simcompttwthq0qwqh
ij
011.0NaN111000
21.0NaN111000
101.0NaN111000
21.0NaN111000
201.0NaN111000
11.0NaN111000
\n", + "
" + ], + "text/plain": [ + " sim comp t tw th q0 qw qh\n", + "i j \n", + "0 1 1.0 NaN 1 1 1 0 0 0\n", + " 2 1.0 NaN 1 1 1 0 0 0\n", + "1 0 1.0 NaN 1 1 1 0 0 0\n", + " 2 1.0 NaN 1 1 1 0 0 0\n", + "2 0 1.0 NaN 1 1 1 0 0 0\n", + " 1 1.0 NaN 1 1 1 0 0 0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"edges\", census=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ddce4255", + "metadata": {}, + "source": [ + "### Global measures\n", + "\n", + "All three global measures are equivalent.\n", + "\n", + "#### Global clustering\n", + "\n", + "$$\n", + "s^W = \\frac{3T}{t^W}\n", + "$$\n", + "\n", + "#### Global closure\n", + "\n", + "$$\n", + "s^H = \\frac{3T}{t^H}\n", + "$$\n", + "\n", + "#### Global similarity\n", + "\n", + "$$\n", + "s = s^W = s^H = \\frac{6T}{t^H + t^W}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4a9137c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sim_gsimtclusttclosurecomp_gcompqclustqclosurettwthq0qwqh
01.01.01.01.0NaNNaNNaNNaN133000
\n", + "
" + ], + "text/plain": [ + " sim_g sim tclust tclosure comp_g comp qclust qclosure t tw th \\\n", + "0 1.0 1.0 1.0 1.0 NaN NaN NaN NaN 1 3 3 \n", + "\n", + " q0 qw qh \n", + "0 0 0 0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"global\", census=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Strong quadrangle" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n \n\n\n \n\n\n \n\n\n\n", + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": { + "image/svg+xml": { + "isolated": true + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "G = ig.Graph(directed=False)\n", + "G.add_vertices(4)\n", + "G.add_edges([\n", + " (0, 1), (1, 2), (2, 3), (3, 0),\n", + "])\n", + "\n", + "# Calculate path census\n", + "P = PathCensus(G)\n", + "\n", + "plot(G)" + ] + }, + { + "cell_type": "markdown", + "id": "fd161244", + "metadata": {}, + "source": [ + "### Node-level measures\n", + "\n", + "#### `qclust` ($q$-clustering)\n", + "\n", + "$$\n", + "c^W_i = \\frac{2Q_i}{q^W_i}\n", + "$$\n", + "\n", + "#### `qclosure` ($q$-closure)\n", + "\n", + "$$\n", + "c^H_i = \\frac{2Q_i}{q^H_i}\n", + "$$\n", + "\n", + "#### `comp` (structural complementarity)\n", + "\n", + "$$\n", + "c_i = \\frac{4Q_i}{q^W_i + q^H_i}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simtclusttclosurecompqclustqclosurettwthq0qwqh
i
00.00.00.01.01.01.0022122
10.00.00.01.01.01.0022122
20.00.00.01.01.01.0022122
30.00.00.01.01.01.0022122
\n", + "
" + ], + "text/plain": [ + " sim tclust tclosure comp qclust qclosure t tw th q0 qw qh\n", + "i \n", + "0 0.0 0.0 0.0 1.0 1.0 1.0 0 2 2 1 2 2\n", + "1 0.0 0.0 0.0 1.0 1.0 1.0 0 2 2 1 2 2\n", + "2 0.0 0.0 0.0 1.0 1.0 1.0 0 2 2 1 2 2\n", + "3 0.0 0.0 0.0 1.0 1.0 1.0 0 2 2 1 2 2" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"nodes\", census=True)" + ] + }, + { + "cell_type": "markdown", + "id": "ad3275ca", + "metadata": {}, + "source": [ + "### Edge-level measures\n", + "\n", + "#### Edge-wise complementarity\n", + "\n", + "$$\n", + "c_{ij} = \\frac{2Q_{ij}}{q^W_{ij} + q^H_{ij}}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2753cc3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simcompttwthq0qwqh
ij
010.01.0011111
30.01.0011111
100.01.0011111
20.01.0011111
210.01.0011111
30.01.0011111
300.01.0011111
20.01.0011111
\n", + "
" + ], + "text/plain": [ + " sim comp t tw th q0 qw qh\n", + "i j \n", + "0 1 0.0 1.0 0 1 1 1 1 1\n", + " 3 0.0 1.0 0 1 1 1 1 1\n", + "1 0 0.0 1.0 0 1 1 1 1 1\n", + " 2 0.0 1.0 0 1 1 1 1 1\n", + "2 1 0.0 1.0 0 1 1 1 1 1\n", + " 3 0.0 1.0 0 1 1 1 1 1\n", + "3 0 0.0 1.0 0 1 1 1 1 1\n", + " 2 0.0 1.0 0 1 1 1 1 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"edges\", census=True)" + ] + }, + { + "cell_type": "markdown", + "id": "bab144e6", + "metadata": {}, + "source": [ + "### Global measures\n", + "\n", + "All global measures are equivalent.\n", + "\n", + "#### Global $q$-clustering\n", + "\n", + "$$\n", + "c^W = \\frac{4Q}{q^W}\n", + "$$\n", + "\n", + "#### Global $q$-closure\n", + "\n", + "$$\n", + "c^H = \\frac{4Q}{q^H}\n", + "$$\n", + "\n", + "#### Global complementarity\n", + "\n", + "$$\n", + "c = c^W = c^H = \\frac{8Q}{q^W + q^H}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "300dddbf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sim_gsimtclusttclosurecomp_gcompqclustqclosurettwthq0qwqh
00.00.00.00.01.01.01.01.0044144
\n", + "
" + ], + "text/plain": [ + " sim_g sim tclust tclosure comp_g comp qclust qclosure t tw th \\\n", + "0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 0 4 4 \n", + "\n", + " q0 qw qh \n", + "0 1 4 4 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "P.coefs(\"global\", census=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fully-connected network\n", + "\n", + "Here we confirm that structural similarity is maximal in fully-connected\n", + "networks. And we check this for a network with random weights to show\n", + "that at the same time that weighted coefficients are properly normalized\n", + "in the `[0, 1]` range." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n \n\n\n \n\n\n \n\n\n \n\n\n\n", + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": { + "image/svg+xml": { + "isolated": true + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "G = ig.Graph.Full(5)\n", + "G = add_random_weights(G)\n", + "\n", + "# Calculate path census\n", + "P = PathCensus(G)\n", + "\n", + "plot(G)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simtclusttclosurecompqclustqclosuretwcthctwthq0wcq0hcqwqh
i
01.01.01.00.00.00.039.042.5078.085.00.00.0165.333333174.666667
11.01.01.00.00.00.055.545.25111.090.50.00.0194.666667167.333333
21.01.01.00.00.00.037.542.2575.084.50.00.0162.666667175.333333
31.01.01.00.00.00.043.543.2587.086.50.00.0173.333333172.666667
41.01.01.00.00.00.040.542.7581.085.50.00.0168.000000174.000000
\n", + "
" + ], + "text/plain": [ + " sim tclust tclosure comp qclust qclosure twc thc tw th \\\n", + "i \n", + "0 1.0 1.0 1.0 0.0 0.0 0.0 39.0 42.50 78.0 85.0 \n", + "1 1.0 1.0 1.0 0.0 0.0 0.0 55.5 45.25 111.0 90.5 \n", + "2 1.0 1.0 1.0 0.0 0.0 0.0 37.5 42.25 75.0 84.5 \n", + "3 1.0 1.0 1.0 0.0 0.0 0.0 43.5 43.25 87.0 86.5 \n", + "4 1.0 1.0 1.0 0.0 0.0 0.0 40.5 42.75 81.0 85.5 \n", + "\n", + " q0wc q0hc qw qh \n", + "i \n", + "0 0.0 0.0 165.333333 174.666667 \n", + "1 0.0 0.0 194.666667 167.333333 \n", + "2 0.0 0.0 162.666667 175.333333 \n", + "3 0.0 0.0 173.333333 172.666667 \n", + "4 0.0 0.0 168.000000 174.000000 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## NODES\n", + "P.coefs(\"nodes\", census=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a1539b58", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simcomptwcthctwthq0wcq0hcqwqh
ij
011.00.022.027.522.027.50.00.041.33333348.666667
21.00.018.017.518.017.50.00.041.33333340.666667
31.00.021.022.521.022.50.00.044.66666746.666667
41.00.017.017.517.017.50.00.038.00000038.666667
101.00.027.522.027.522.00.00.048.66666741.333333
21.00.027.521.527.521.50.00.049.33333341.333333
31.00.027.523.527.523.50.00.046.66666741.333333
41.00.028.523.528.523.50.00.050.00000043.333333
201.00.017.518.017.518.00.00.040.66666741.333333
11.00.021.527.521.527.50.00.041.33333349.333333
31.00.017.519.517.519.50.00.038.66666741.333333
41.00.018.519.518.519.50.00.042.00000043.333333
301.00.022.521.022.521.00.00.046.66666744.666667
11.00.023.527.523.527.50.00.041.33333346.666667
21.00.019.517.519.517.50.00.041.33333338.666667
41.00.021.520.521.520.50.00.044.00000042.666667
401.00.017.517.017.517.00.00.038.66666738.000000
11.00.023.528.523.528.50.00.043.33333350.000000
21.00.019.518.519.518.50.00.043.33333342.000000
31.00.020.521.520.521.50.00.042.66666744.000000
\n", + "
" + ], + "text/plain": [ + " sim comp twc thc tw th q0wc q0hc qw qh\n", + "i j \n", + "0 1 1.0 0.0 22.0 27.5 22.0 27.5 0.0 0.0 41.333333 48.666667\n", + " 2 1.0 0.0 18.0 17.5 18.0 17.5 0.0 0.0 41.333333 40.666667\n", + " 3 1.0 0.0 21.0 22.5 21.0 22.5 0.0 0.0 44.666667 46.666667\n", + " 4 1.0 0.0 17.0 17.5 17.0 17.5 0.0 0.0 38.000000 38.666667\n", + "1 0 1.0 0.0 27.5 22.0 27.5 22.0 0.0 0.0 48.666667 41.333333\n", + " 2 1.0 0.0 27.5 21.5 27.5 21.5 0.0 0.0 49.333333 41.333333\n", + " 3 1.0 0.0 27.5 23.5 27.5 23.5 0.0 0.0 46.666667 41.333333\n", + " 4 1.0 0.0 28.5 23.5 28.5 23.5 0.0 0.0 50.000000 43.333333\n", + "2 0 1.0 0.0 17.5 18.0 17.5 18.0 0.0 0.0 40.666667 41.333333\n", + " 1 1.0 0.0 21.5 27.5 21.5 27.5 0.0 0.0 41.333333 49.333333\n", + " 3 1.0 0.0 17.5 19.5 17.5 19.5 0.0 0.0 38.666667 41.333333\n", + " 4 1.0 0.0 18.5 19.5 18.5 19.5 0.0 0.0 42.000000 43.333333\n", + "3 0 1.0 0.0 22.5 21.0 22.5 21.0 0.0 0.0 46.666667 44.666667\n", + " 1 1.0 0.0 23.5 27.5 23.5 27.5 0.0 0.0 41.333333 46.666667\n", + " 2 1.0 0.0 19.5 17.5 19.5 17.5 0.0 0.0 41.333333 38.666667\n", + " 4 1.0 0.0 21.5 20.5 21.5 20.5 0.0 0.0 44.000000 42.666667\n", + "4 0 1.0 0.0 17.5 17.0 17.5 17.0 0.0 0.0 38.666667 38.000000\n", + " 1 1.0 0.0 23.5 28.5 23.5 28.5 0.0 0.0 43.333333 50.000000\n", + " 2 1.0 0.0 19.5 18.5 19.5 18.5 0.0 0.0 43.333333 42.000000\n", + " 3 1.0 0.0 20.5 21.5 20.5 21.5 0.0 0.0 42.666667 44.000000" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## EDGES\n", + "P.coefs(\"edges\", census=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "05af3bde", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sim_gsimtclusttclosurecomp_gcompqclustqclosuretwcthctwthq0wcq0hcqwqh
01.01.01.01.00.00.00.00.072.072.0216.0216.00.00.0432.0432.0
\n", + "
" + ], + "text/plain": [ + " sim_g sim tclust tclosure comp_g comp qclust qclosure twc thc \\\n", + "0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 72.0 72.0 \n", + "\n", + " tw th q0wc q0hc qw qh \n", + "0 216.0 216.0 0.0 0.0 432.0 432.0 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## GLOBAL\n", + "P.coefs(\"global\", census=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fully-connected bipartite network\n", + "\n", + "Here we confirm that structural complementarity is maximal in fully-connected\n", + "bipartite networks. And we check this for a network with random weights to \n", + "show that at the same time that weighted coefficients are properly normalized\n", + "in the `[0, 1]` range." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n \n\n\n \n\n\n \n\n\n \n\n\n \n\n\n\n", + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": { + "image/svg+xml": { + "isolated": true + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "G = ig.Graph.Full_Bipartite(3, 3)\n", + "G = add_random_weights(G)\n", + "\n", + "# Calculate path census\n", + "P = PathCensus(G)\n", + "\n", + "plot(G, layout=G.layout_bipartite())" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simtclusttclosurecompqclustqclosuretwcthctwthq0wcq0hcqwqh
i
00.00.00.01.01.01.00.00.030.034.533.036.066.072.0
10.00.00.01.01.01.00.00.044.038.040.036.080.072.0
20.00.00.01.01.01.00.00.034.035.535.036.070.072.0
30.00.00.01.01.01.00.00.028.034.032.036.064.072.0
40.00.00.01.01.01.00.00.040.037.038.036.076.072.0
50.00.00.01.01.01.00.00.040.037.038.036.076.072.0
\n", + "
" + ], + "text/plain": [ + " sim tclust tclosure comp qclust qclosure twc thc tw th q0wc \\\n", + "i \n", + "0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 30.0 34.5 33.0 \n", + "1 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 44.0 38.0 40.0 \n", + "2 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 34.0 35.5 35.0 \n", + "3 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 28.0 34.0 32.0 \n", + "4 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 40.0 37.0 38.0 \n", + "5 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 40.0 37.0 38.0 \n", + "\n", + " q0hc qw qh \n", + "i \n", + "0 36.0 66.0 72.0 \n", + "1 36.0 80.0 72.0 \n", + "2 36.0 70.0 72.0 \n", + "3 36.0 64.0 72.0 \n", + "4 36.0 76.0 72.0 \n", + "5 36.0 76.0 72.0 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## NODES\n", + "P.coefs(\"nodes\", census=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "aa88676d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
simcomptwcthctwthq0wcq0hcqwqh
ij
030.01.00.00.09.08.521.33333320.66666721.33333320.666667
40.01.00.00.011.013.523.33333326.66666723.33333326.666667
50.01.00.00.010.012.521.33333324.66666721.33333324.666667
130.01.00.00.014.010.026.66666721.33333326.66666721.333333
40.01.00.00.014.013.024.66666723.33333324.66666723.333333
50.01.00.00.016.015.028.66666727.33333328.66666727.333333
230.01.00.00.011.09.524.00000022.00000024.00000022.000000
40.01.00.00.012.013.524.00000026.00000024.00000026.000000
50.01.00.00.011.012.522.00000024.00000022.00000024.000000
300.01.00.00.08.59.020.66666721.33333320.66666721.333333
10.01.00.00.010.014.021.33333326.66666721.33333326.666667
20.01.00.00.09.511.022.00000024.00000022.00000024.000000
400.01.00.00.013.511.026.66666723.33333326.66666723.333333
10.01.00.00.013.014.023.33333324.66666723.33333324.666667
20.01.00.00.013.512.026.00000024.00000026.00000024.000000
500.01.00.00.012.510.024.66666721.33333324.66666721.333333
10.01.00.00.015.016.027.33333328.66666727.33333328.666667
20.01.00.00.012.511.024.00000022.00000024.00000022.000000
\n", + "
" + ], + "text/plain": [ + " sim comp twc thc tw th q0wc q0hc qw \\\n", + "i j \n", + "0 3 0.0 1.0 0.0 0.0 9.0 8.5 21.333333 20.666667 21.333333 \n", + " 4 0.0 1.0 0.0 0.0 11.0 13.5 23.333333 26.666667 23.333333 \n", + " 5 0.0 1.0 0.0 0.0 10.0 12.5 21.333333 24.666667 21.333333 \n", + "1 3 0.0 1.0 0.0 0.0 14.0 10.0 26.666667 21.333333 26.666667 \n", + " 4 0.0 1.0 0.0 0.0 14.0 13.0 24.666667 23.333333 24.666667 \n", + " 5 0.0 1.0 0.0 0.0 16.0 15.0 28.666667 27.333333 28.666667 \n", + "2 3 0.0 1.0 0.0 0.0 11.0 9.5 24.000000 22.000000 24.000000 \n", + " 4 0.0 1.0 0.0 0.0 12.0 13.5 24.000000 26.000000 24.000000 \n", + " 5 0.0 1.0 0.0 0.0 11.0 12.5 22.000000 24.000000 22.000000 \n", + "3 0 0.0 1.0 0.0 0.0 8.5 9.0 20.666667 21.333333 20.666667 \n", + " 1 0.0 1.0 0.0 0.0 10.0 14.0 21.333333 26.666667 21.333333 \n", + " 2 0.0 1.0 0.0 0.0 9.5 11.0 22.000000 24.000000 22.000000 \n", + "4 0 0.0 1.0 0.0 0.0 13.5 11.0 26.666667 23.333333 26.666667 \n", + " 1 0.0 1.0 0.0 0.0 13.0 14.0 23.333333 24.666667 23.333333 \n", + " 2 0.0 1.0 0.0 0.0 13.5 12.0 26.000000 24.000000 26.000000 \n", + "5 0 0.0 1.0 0.0 0.0 12.5 10.0 24.666667 21.333333 24.666667 \n", + " 1 0.0 1.0 0.0 0.0 15.0 16.0 27.333333 28.666667 27.333333 \n", + " 2 0.0 1.0 0.0 0.0 12.5 11.0 24.000000 22.000000 24.000000 \n", + "\n", + " qh \n", + "i j \n", + "0 3 20.666667 \n", + " 4 26.666667 \n", + " 5 24.666667 \n", + "1 3 21.333333 \n", + " 4 23.333333 \n", + " 5 27.333333 \n", + "2 3 22.000000 \n", + " 4 26.000000 \n", + " 5 24.000000 \n", + "3 0 21.333333 \n", + " 1 26.666667 \n", + " 2 24.000000 \n", + "4 0 23.333333 \n", + " 1 24.666667 \n", + " 2 24.000000 \n", + "5 0 21.333333 \n", + " 1 28.666667 \n", + " 2 22.000000 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## EDGES\n", + "P.coefs(\"edges\", census=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "74b75f8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sim_gsimtclusttclosurecomp_gcompqclustqclosuretwcthctwthq0wcq0hcqwqh
00.00.00.00.01.01.01.01.00.00.0108.0108.054.054.0216.0216.0
\n", + "
" + ], + "text/plain": [ + " sim_g sim tclust tclosure comp_g comp qclust qclosure twc thc \\\n", + "0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 \n", + "\n", + " tw th q0wc q0hc qw qh \n", + "0 108.0 108.0 54.0 54.0 216.0 216.0 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## GLOBAL\n", + "P.coefs(\"global\", census=True)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "8e0900f3da58897fd209d64081d3031deeb7de3d74b04d540bcaacc585fcac50" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('ssc-paper': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/2-null-model-tests.ipynb b/examples/2-null-model-tests.ipynb new file mode 100644 index 0000000..770ba02 --- /dev/null +++ b/examples/2-null-model-tests.ipynb @@ -0,0 +1,571 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "In this notebook we present a set of basic tests of the implementations\n", + "of null models provided by `pathcensus` package. All the null models\n", + "are also tested against an automated suite of unit test, but we additionally\n", + "provide the below examples as the notebook format is argurably much easier\n", + "to follow. We use `igraph` package to generate graphs.\n", + "\n", + "We defined all models following the formulas and terminology introduced in:\n", + "\n", + "> [1] Squartini, T., Mastrandrea, R., & Garlaschelli, D. (2015). \n", + "> Unbiased sampling of network ensembles. \n", + "> New Journal of Physics, 17(2), 023052. https://doi.org/10.1088/1367-2630/17/2/023052\n", + "\n", + "and:\n", + "\n", + "> [2] Vallarano, N., Bruno, M., Marchese, E., Trapani, G., Saracco, F., Cimini, G., Zanon, M., & Squartini, T. (2021). Fast and scalable likelihood maximization for Exponential Random Graph Models with local constraints. Scientific Reports, 11(1), 15227. 
https://doi.org/10.1038/s41598-021-93830-4\n", +    "\n", +    "\n" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 1, +   "id": "f913aa1b", +   "metadata": {}, +   "outputs": [], +   "source": [ +    "import random\n", +    "import numpy as np\n", +    "import igraph as ig\n", +    "from pathcensus.nullmodels import UBCM, UECM\n", +    "from pathcensus.utils import rowsums, relclose\n", +    "\n", +    "def add_random_weights(graph):\n", +    "    graph = graph.copy()\n", +    "    graph.es[\"weight\"] = np.random.randint(1, 11, (graph.ecount(),))\n", +    "    return graph\n", +    "\n", +    "def make_er_graph(n, dbar):\n", +    "    p = dbar / (n-1)\n", +    "    return ig.Graph.Erdos_Renyi(n, p=p, directed=False)\n", +    "\n", +    "def make_rgg(n, dbar):\n", +    "    radius = np.sqrt(dbar/(np.pi*(n-1)))\n", +    "    return ig.Graph.GRG(n, radius=radius, torus=True)\n", +    "\n", +    "# Global parameters\n", +    "# -----------------\n", +    "N_NODES = 100     # number of nodes in random graphs\n", +    "KBAR = 10         # expected average degree in random graphs\n", +    "RTOL = 1e-1       # relative tolerance when comparing simulated and expected values\n", +    "N_SAMPLES = 1000  # number of samples used for stochastic testing of expectations" +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "dfa28229", +   "metadata": {}, +   "source": [ +    "## Undirected Binary Configuration Model (UBCM)\n", +    "\n", +    "This is a soft (canonical) configuration model for undirected, unweighted\n", +    "networks. It is defined in Sec. 3.1 and Eq. (8) in [1].\n", +    "\n", +    "For this model we will test whether node degrees are indeed reproduced\n", +    "in expectation, which is exactly what the model should do. We will test\n", +    "this on two small random graphs with very different structure:\n", +    "\n", +    "1. Erdős–Rényi random graph\n", +    "2. Random geometric graph\n", +    "\n", +    "Both graphs will have $100$ nodes and average degrees approximately equal to $10$.\n"
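, +    "\n", +    "For reference, under the UBCM edges are independent Bernoulli random variables\n", +    "with success probabilities\n", +    "\n", +    "$$\n", +    "p_{ij} = \\frac{x_i x_j}{1 + x_i x_j},\n", +    "$$\n", +    "\n", +    "where the node-specific parameters $x_i$ are fitted so that the expected\n", +    "degrees $\\langle k_i \\rangle = \\sum_{j \\neq i} p_{ij}$ reproduce the\n", +    "observed degree sequence (cf. [1])."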
+ ] + }, + { + "cell_type": "markdown", + "id": "b651e17d", + "metadata": {}, + "source": [ + "### ER random graph" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "deefaa46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "random.seed(303)\n", + "\n", + "graph = make_er_graph(N_NODES, KBAR)\n", + "degseq = np.array(graph.degree())\n", + "\n", + "ubcm = UBCM(graph)\n", + "ubcm.fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f976d48e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST ANALYTICAL EXPECTED DEGREES\n", + "relclose(ubcm.ED, degseq, rtol=RTOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ba3c02c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST EXPECTATION THROUGH SAMPLING\n", + "expected = np.zeros_like(degseq, dtype=float)\n", + "\n", + "for randomized in ubcm.sample(N_SAMPLES):\n", + " # Sample graph realizations are adjacency matrices\n", + " expected += rowsums(randomized)\n", + "\n", + "expected = expected / N_SAMPLES\n", + "\n", + "relclose(expected, degseq, rtol=RTOL) " + ] + }, + { + "cell_type": "markdown", + "id": "7086d2de", + "metadata": {}, + "source": [ + "### Random geometric graph" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "67e2aec4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "random.seed(304)\n", + "\n", + "graph = make_rgg(N_NODES, KBAR)\n", + "degseq = np.array(graph.degree())\n", + "\n", + "ubcm = UBCM(graph)\n", + "ubcm.fit()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "55ee4fed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST ANALYTICAL EXPECTED DEGREES\n", + "relclose(ubcm.ED, degseq, rtol=RTOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f8d8c329", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST EXPECTATION THROUGH SAMPLING\n", + "expected = np.zeros_like(degseq, dtype=float)\n", + "\n", + "for randomized in ubcm.sample(N_SAMPLES):\n", + " # Sample graph realizations are adjacency matrices\n", + " expected += rowsums(randomized)\n", + "\n", + "expected = expected / N_SAMPLES\n", + "\n", + "relclose(expected, degseq, rtol=RTOL) " + ] + }, + { + "cell_type": "markdown", + "id": "c4ffcdff", + "metadata": {}, + "source": [ + "## Undirected Enhanced Configuration Model\n", + "\n", + "This null model constrains both expected degree sequence and strength\n", + "sequence. 
We test it again against ER and RGG networks, but this time\n", +    "we also add random edge weights between $1$ and $10$.\n", +    "\n", +    "### ER random graph" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 8, +   "id": "142ef11e", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "" +      ] +     }, +     "execution_count": 8, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "random.seed(305)\n", +    "\n", +    "graph = make_er_graph(N_NODES, KBAR)\n", +    "graph = add_random_weights(graph)\n", +    "D = np.array(graph.degree())\n", +    "S = np.array(graph.strength(weights=\"weight\"))\n", +    "\n", +    "uecm = UECM(graph)\n", +    "uecm.fit()" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 9, +   "id": "bf60309d", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "True" +      ] +     }, +     "execution_count": 9, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "## TEST ANALYTICAL EXPECTED DEGREES\n", +    "relclose(uecm.ED, D, rtol=RTOL)" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 10, +   "id": "74b04ce7", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "True" +      ] +     }, +     "execution_count": 10, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "## TEST EXPECTATION THROUGH SAMPLING\n", +    "expected = np.zeros_like(D, dtype=float)\n", +    "\n", +    "for randomized in uecm.sample(N_SAMPLES):\n", +    "    # Samples are sparse weighted adjacency matrices;\n", +    "    # binarize them so that row sums give degrees\n", +    "    randomized.data[:] = 1\n", +    "    expected += rowsums(randomized)\n", +    "\n", +    "expected = expected / N_SAMPLES\n", +    "\n", +    "relclose(expected, D, rtol=RTOL) " +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 11, +   "id": "0b7cbd26", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "True" +      ] +     }, +     "execution_count": 11, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "## TEST ANALYTICAL EXPECTED STRENGTHS\n", +    "relclose(uecm.ES, S, rtol=RTOL)" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 12, +   "id": "5a0afe4c", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "True" +      ] +     }, +     "execution_count": 12, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "## TEST EXPECTATION THROUGH SAMPLING\n", +    "expected = np.zeros_like(S, dtype=float)\n", +    "\n", +    "for randomized in uecm.sample(N_SAMPLES):\n", +    "    # Samples are sparse weighted adjacency matrices; row sums give strengths\n", +    "    expected += rowsums(randomized)\n", +    "\n", +    "expected = expected / N_SAMPLES\n", +    "\n", +    "relclose(expected, S, rtol=RTOL) " +   ] +  }, +  { +   "cell_type": "markdown", +   "id": "3936b637", +   "metadata": {}, +   "source": [ +    "### Random geometric graph" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 13, +   "id": "ae9c09e1", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "" +      ] +     }, +     "execution_count": 13, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "random.seed(306)\n", +    "\n", +    "graph = make_rgg(N_NODES, KBAR)\n", +    "graph = add_random_weights(graph)\n", +    "D = np.array(graph.degree())\n", +    "S = np.array(graph.strength(weights=\"weight\"))\n", +    "\n", +    "uecm = UECM(graph)\n", +    "uecm.fit()" +   ] +  }, +  { +   "cell_type": "code", +   "execution_count": 14, +   "id": "84e7e266", +   "metadata": {}, +   "outputs": [ +    { +     "data": { +      "text/plain": [ +       "True" +      ] +     }, +     "execution_count": 14, +     "metadata": {}, +     "output_type": "execute_result" +    } +   ], +   "source": [ +    "## TEST ANALYTICAL EXPECTED 
DEGREES\n", + "relclose(uecm.ED, D, rtol=RTOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1b222781", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST EXPECTATION THROUGH SAMPLING\n", + "expected = np.zeros_like(degseq, dtype=float)\n", + "\n", + "for randomized in uecm.sample(N_SAMPLES):\n", + " # Sample graph realizations are adjacency matrices\n", + " randomized.data[:] = 1\n", + " expected += rowsums(randomized)\n", + "\n", + "expected = expected / N_SAMPLES\n", + "\n", + "relclose(expected, D, rtol=RTOL) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c1acefc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST ANALYTICAL EXPECTED STRENGTHS\n", + "relclose(uecm.ES, S, rtol=RTOL)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "951cd4df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TEST EXPECTATION THROUGH SAMPLING\n", + "expected = np.zeros_like(degseq, dtype=float)\n", + "\n", + "for randomized in uecm.sample(N_SAMPLES):\n", + " # Sample graph realizations are adjacency matrices\n", + " expected += rowsums(randomized)\n", + "\n", + "expected = expected / N_SAMPLES\n", + "\n", + "relclose(expected, S, rtol=RTOL) " + ] + } + ], + "metadata": { + "interpreter": { + "hash": "8e0900f3da58897fd209d64081d3031deeb7de3d74b04d540bcaacc585fcac50" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('ssc-paper': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "orig_nbformat": 3 + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/3-inference-tests.ipynb b/examples/3-inference-tests.ipynb new file mode 100644 index 0000000..2ca2bad --- /dev/null +++ b/examples/3-inference-tests.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "In this notebook we present a set of basic tests of correctness of auxiliary\n", + "classes and methods for conducting statistical inference based on ERGM\n", + "null models implemented in `pathcensus` package. Everything we show here\n", + "is also tested in the automated unit test suite, but we provide also\n", + "a notebook-based confirmation as it is arguably much easier to follow.\n", + "\n", + "We focus on testing methods for estimating $p$-values of observed\n", + "edge/node/graph structural coefficients relative to a null model\n", + "(see notebook `2-null-models-test.ipynb` for more details).\n", + "More conretely, we will consider the two following test cases:\n", + "\n", + "1. **Erdős–Rényi random graph.** In this case we expect all structural\n", + " coefficients on all levels (edges/nodes/graph) to be insignificant\n", + " with type I error rate not greater than $\\alpha$ when using a proper\n", + " adjustment for multiple testing (FDR procedure by Benjamini and Hochberg).\n", + "2. 
**Random geometric graph (RGG).** As above, but in this case we expect\n", + " similarity coefficients to be significantly larger than null model\n", + " expectations.\n", + "\n", + "For the null model we will use the Undirected Binary Configuration Model (UBCM)\n", + "implemented in the `pathcensus` package." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f913aa1b", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import igraph as ig\n", + "from pathcensus import PathCensus\n", + "from pathcensus.nullmodels import UBCM\n", + "from pathcensus.inference import Inference\n", + "from pathcensus.utils import set_seed\n", + "\n", + "def make_er_graph(n, dbar):\n", + " p = dbar / (n-1)\n", + " return ig.Graph.Erdos_Renyi(n, p=p, directed=False)\n", + "\n", + "def make_rgg(n, dbar):\n", + " radius = np.sqrt(dbar/(np.pi*(n-1)))\n", + " return ig.Graph.GRG(n, radius=radius, torus=True)\n", + "\n", + "# Global parameters\n", + "# -----------------\n", + "N_NODES = 100 # number of nodes in random graphs\n", + "KBAR = 10 # expected average degree in random graphs\n", + "ALPHA = 0.01 # Upper bound for type I error rate with the FDR correction\n", + "N_SAMPLES = 100 # number of samples used for estimating p-values " + ] + }, + { + "cell_type": "markdown", + "id": "dfa28229", + "metadata": {}, + "source": [ + "## Implementing statistical inference procedure\n", + "\n", + "In this project we conduct most of the statistical inference based on simple\n", + "exponential random graph models (ERGMs), primarily different variants\n", + "of the configuration model.\n", + "\n", + "Comparisons between observed values of various graph properties and\n", + "expectations based on a null model are done according to the following scheme:\n", + "\n", + "1. Calculate observed values of graph statistics of interest and\n", + " index each node with its corresponding sufficient statistic(s).\n", + " Note that in the models we use, sufficient statistics are always defined \n", + " for nodes (e.g. the degree sequence in the standard configuration model).\n", + "2. Sample $R$ randomized realizations from a null model of choice.\n", + "3. Index nodes in each randomized graph with values of their corresponding\n", + " sufficient statistics.\n", + "4. Calculate graph statistics on the $R$ randomized graphs.\n", + "5. Group simulated data by unique values of sufficient statistics.\n", + " In the case of nodes these are original sufficient statistics\n", + " (always defined for nodes in our case) and in the case of edges\n", + " these are unique combinations of sufficient statistics\n", + " (possibly coarse-grained to avoid overly sparse data).\n", + " In the case of graph-level statistics no grouping is necessary.\n", + "6. 
Compare observed values against the simulated values grouped\n", + " as described above, where values for individual nodes/edges\n", + " are compared against distributions corresponding to their\n", + " values of sufficient statistics.\n", + "\n", + "**NOTE.** In this approach only one-sided tests are really possible.\n", + " Moreover, estimating p-values for edge-wise statistics\n", + " is currently problematic due to the need for coarse-graining\n", + " (and it is not yet clear what the optimal strategy is).\n", + "\n", + "In practice we usually implement the above procedure using a helper\n", + "`Inference` class defined in the `pathcensus` package, which abstracts away\n", + "most of the tedious programming logic and requires the end user to implement\n", + "only a `statistics` method defining the actual graph statistics we want\n", + "to calculate; a stand-alone sketch of the procedure is also given below.\n", + "\n", + "Below we do this for the sake of testing the $p$-value estimation methods." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "832e1de1", + "metadata": {}, + "outputs": [], + "source": [ + "def statistics(graph, method, *args, **kwds):\n", + " \"\"\"Function computing graph statistics for which inference\n", + " is to be run.\n", + " \n", + " Parameters\n", + " ----------\n", + " graph\n", + " A graph-like object.\n", + " method\n", + " Name of the method (string) defined on \n", + " :py:class:`pathcensus.PathCensus`.\n", + " \n", + " Returns\n", + " -------\n", + " data\n", + " Data frame or series with graph statistics.\n", + " \"\"\"\n", + " paths = PathCensus(graph)\n", + " method = getattr(paths, method)\n", + " return method(*args, **kwds) " + ] + }, + { + "cell_type": "markdown", + "id": "b651e17d", + "metadata": {}, + "source": [ + "## ER random graph\n", + "\n", + "In this case we expect all coefficients to be insignificant.\n", + "We use one-sided tests with the \"greater\" alternative hypothesis."
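For readers who prefer a stand-alone illustration of the simulate-and-compare scheme listed above, the snippet below sketches its logic with plain `numpy`. It is schematic only: `sample_null_statistic` and the "observed" values are hypothetical stand-ins, not part of the `pathcensus` API.

```python
import numpy as np

rng = np.random.default_rng(42)

def sample_null_statistic(rng, n=100):
    # Hypothetical stand-in: node-level statistic of one randomized (null) graph.
    return rng.normal(size=n)

# Step 1: "observed" node-level statistic (stand-in values).
observed = rng.normal(size=100)

# Steps 2-4: sample R randomized realizations and compute the statistic on each.
R = 1000
null = np.stack([sample_null_statistic(rng) for _ in range(R)])

# Steps 5-6: compare observed values against the simulated null distribution;
# a one-sided ("greater") p-value per node.
pvals = (null >= observed).mean(axis=0)
```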
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "deefaa46", + "metadata": {}, + "outputs": [], + "source": [ + "# This sets seed of random, numpy and numba\n", + "set_seed(371)\n", + "\n", + "graph = make_er_graph(N_NODES, KBAR)\n", + "ubcm = UBCM(graph)\n", + "ubcm.fit()\n", + "\n", + "infer = Inference(graph, ubcm, statistics)\n", + "null_kws = dict(progress=True)" + ] + }, + { + "cell_type": "markdown", + "id": "79ec1f1f", + "metadata": {}, + "source": [ + "### Node-wise coefficients" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ba3c02c1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:04<00:00, 22.33it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, True)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR NODE-WISE STRUCTURAL COEFFICIENTS\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"coefs\",\n", + " mode=\"nodes\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# DOES NOT EXCEED ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac <= ALPHA" + ] + }, + { + "cell_type": "markdown", + "id": "f9cfb943", + "metadata": {}, + "source": [ + "### Global coefficients" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bbf4b808", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:04<00:00, 20.76it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, True)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR GLOBAL STRUCTURAL COEFFICIENTS\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"coefs\",\n", + " mode=\"global\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# DOES NOT EXCEED ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac <= ALPHA" + ] + }, + { + "cell_type": "markdown", + "id": "ab78509b", + "metadata": {}, + "source": [ + "## Random geometric graph\n", + "\n", + "In this case we expect significant presence of similarity\n", + "and not significant results for complementarity." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "67e2aec4", + "metadata": {}, + "outputs": [], + "source": [ + "set_seed(7171)\n", + "\n", + "graph = make_rgg(N_NODES, KBAR)\n", + "ubcm = UBCM(graph)\n", + "ubcm.fit()\n", + "\n", + "infer = Inference(graph, ubcm, statistics)" + ] + }, + { + "cell_type": "markdown", + "id": "a5965e0f", + "metadata": {}, + "source": [ + "### Node-wise similarity\n", + "\n", + "In this case we expect a fraction of significant results greater than $\\alpha$\n", + "using a one-sided test." 
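As a side note, the one-sided $p$-values used throughout can be made concrete with a tiny helper. This shows only a common convention (the "+1" correction that avoids exactly-zero estimates); it is not necessarily the exact estimator used inside `estimate_pvalues`.

```python
import numpy as np

def empirical_pvalue(observed: float, null: np.ndarray) -> float:
    """One-sided ("greater") empirical p-value with the +1 correction."""
    return (1 + int((null >= observed).sum())) / (1 + null.size)

null = np.random.default_rng(0).normal(size=1000)
print(empirical_pvalue(2.0, null))  # roughly P(Z >= 2), i.e. about 0.02
```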
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "55ee4fed", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:03<00:00, 32.34it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.9833333333333333, True)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR NODE-WISE SIMILARITY COEFFICIENTS\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"simcoefs\",\n", + " mode=\"nodes\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# EXCEEDS ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac > ALPHA" + ] + }, + { + "cell_type": "markdown", + "id": "c8e6f64f", + "metadata": {}, + "source": [ + "### Global similarity" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f8d8c329", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:02<00:00, 34.64it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(1.0, True)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR GLOBAL SIMILARITY COEFFICIENTS\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"simcoefs\",\n", + " mode=\"global\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# EXCEEDS ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac > ALPHA" + ] + }, + { + "cell_type": "markdown", + "id": "c202decc", + "metadata": {}, + "source": [ + "### Node-wise complementarity\n", + "\n", + "We expect no significant results."
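Since the introduction relies on the Benjamini-Hochberg FDR procedure for multiple testing, a minimal reference implementation of the step-up rule is sketched below. The package itself delegates this to `statsmodels`; this sketch is for illustration only.

```python
import numpy as np

def benjamini_hochberg(pvals: np.ndarray, alpha: float = 0.05) -> np.ndarray:
    """Boolean mask of hypotheses rejected by the BH step-up procedure."""
    m = pvals.size
    order = np.argsort(pvals)
    # Reject all hypotheses up to the largest k with p_(k) <= alpha * k / m.
    below = pvals[order] <= alpha * np.arange(1, m + 1) / m
    rejected = np.zeros(m, dtype=bool)
    if below.any():
        kmax = np.nonzero(below)[0].max()
        rejected[order[:kmax + 1]] = True
    return rejected
```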
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5aa7c84b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:02<00:00, 39.55it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, True)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR NODE-WISE COMPLEMENTARITY COEFFICIENTS\n", + "# (W0-COMPLEMENTARITY IS USED)\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"compcoefs\",\n", + " mode=\"nodes\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# DOES NOT EXCEED ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac <= ALPHA" + ] + }, + { + "cell_type": "markdown", + "id": "3c97338c", + "metadata": {}, + "source": [ + "### Global complementarity" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "40bcbd91", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 100/100 [00:03<00:00, 31.10it/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "(0.0, True)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ONE-TAILED TEST FOR GLOBAL COMPLEMENTARITY COEFFICIENTS\n", + "# (W0-COMPLEMENTARITY IS USED)\n", + "data, null = infer.init_comparison(\n", + " n=N_SAMPLES,\n", + " method=\"compcoefs\",\n", + " mode=\"global\",\n", + " null_kws=null_kws\n", + ")\n", + "pvals = infer.estimate_pvalues(data, null, alternative=\"greater\")\n", + "\n", + "# CHECK IF THE FRACTION OF SIGNIFICANT VALUES\n", + "# DOES NOT EXCEED ALPHA\n", + "pvals_frac = (pvals.values <= ALPHA).mean()\n", + "pvals_frac, pvals_frac <= ALPHA" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "8e0900f3da58897fd209d64081d3031deeb7de3d74b04d540bcaacc585fcac50" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('ssc-paper': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "orig_nbformat": 3 + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/4-er-random-graphs.ipynb b/examples/4-er-random-graphs.ipynb new file mode 100644 index 0000000..bcbec61 --- /dev/null +++ b/examples/4-er-random-graphs.ipynb @@ -0,0 +1,739 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Null models\n", + "\n", + "In this notebook we show several basic results concerning the behavior\n", + "of quadrangular structural coefficients in standard null models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global $q$-clustering in Erdős–Rényi (ER) random graphs\n", + "\n", + "It is a well-known fact that in ER random graphs the expected global clustering\n", + "is equal to $p$, that is, to the edge existence probability.\n", + "\n", + "It can be deduced simply from the fact that for any $2$-path:\n", + "\n", + "```\n", + "i -- j -- k\n", + "```\n", + "\n", + "the probability that `i` and `k` are also connected is still equal to $p$,\n", + "as it is constant for all possible edges. 
Hence, each $2$-path is closed\n", + "into a triangle with probability exactly $p$, resulting in the global\n", + "clustering being equal to $p$.\n", + "\n", + "Similarly, for any $3$-path:\n", + "\n", + "```\n", + " i l\n", + " | |\n", + " j ---- k\n", + "```\n", + "\n", + "the probability that it is closed and forms a strict quadrangle is equal to\n", + "the probability that `i` and `k` as well as `j` and `l` are not connected,\n", + "which is $(1-p)^2$, times the probability that `i` and `l` are connected,\n", + "which is $p$. So this gives an expected quadrangular clustering\n", + "(global complementarity) equal to\n", + "$p(1-p)^2$. Note that for graphs with low values of $p$ we have that\n", + "$(1-p) \approx 1$, so both expected clustering and $q$-clustering\n", + "are comparable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Simulation\n", + "\n", + "We check the above assertion by simulating $20$ ER random graphs with\n", + "$5000$ nodes for each of the values $p = .001, .002, .005, .01$.\n", + "\n", + "**NOTE.** This may take a while to execute." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import pandas as pd\n", + "import igraph as ig\n", + "from tqdm import tqdm\n", + "from pathcensus import PathCensus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 20/20 [00:16<00:00, 1.24it/s]\n", + "100%|██████████| 20/20 [00:05<00:00, 3.54it/s]\n", + "100%|██████████| 20/20 [00:22<00:00, 1.15s/it]\n", + "100%|██████████| 20/20 [01:57<00:00, 5.88s/it]\n" + ] + } + ], + "source": [ + "random.seed(101)\n", + "\n", + "P = (.001, .002, .005, .01)\n", + "N = 5000\n", + "R = 20\n", + "\n", + "er_null = []\n", + "\n", + "for p in P:\n", + " for _ in tqdm(range(R)):\n", + " graph = ig.Graph.Erdos_Renyi(N, p=p, directed=False)\n", + " paths = PathCensus(graph) \n", + " index = dict(p=p, q=p*(1-p)**2)\n", + " df = paths.coefs(\"global\")\n", + " for k, v in index.items():\n", + " df[k] = v\n", + " er_null.append(df)\n", + "\n", + "data = pd.concat(er_null, axis=0, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "sim = data[[\"p\", \"sim_g\", \"sim\", \"tclust\", \"tclosure\"]] \\\n", + " .reset_index() \\\n", + " .melt(id_vars=[\"index\", \"p\"]) \\\n", + " .assign(relerr=lambda df: abs(df[\"value\"] - df[\"p\"]) / df[\"p\"])\n", + " \n", + "comp = data[[\n", + " \"q\", \"comp_g\", \"comp\", \"qclust\", \"qclosure\"\n", + "]] \\\n", + " .reset_index() \\\n", + " .melt(id_vars=[\"index\", \"q\"]) \\\n", + " .assign(relerr=lambda df: abs(df[\"value\"] - df[\"q\"]) / df[\"q\"]) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Similarity coefficients vis-a-vis density in ER random graph ($p$)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " count mean std min 25% 50% \\\n", + "p variable \n", + "0.001 sim 20.0 0.000795 0.000215 0.000405 0.000652 0.000790 \n", + " sim_g 20.0 0.000965 0.000249 0.000457 0.000811 0.000920 \n", + " tclosure 20.0 0.000771 0.000199 0.000382 0.000652 0.000756 \n", + " tclust 20.0 0.000994 0.000335 0.000526 0.000757 0.000963 \n", + "0.002 sim 20.0 0.001849 0.000118 0.001588 0.001806 0.001872 \n", + " sim_g 20.0 0.002028 0.000133 0.001748 0.002006 0.002056 \n", + " tclosure 20.0 0.001809 0.000118 0.001551 0.001775 0.001840 \n", + " tclust 20.0 0.002037 0.000127 0.001742 0.001979 0.002078 \n", + "0.005 sim 20.0 0.004832 0.000079 0.004725 0.004766 0.004819 \n", + " sim_g 20.0 0.004993 0.000083 0.004894 0.004927 0.004970 \n", + " tclosure 20.0 0.004786 0.000078 0.004690 0.004718 0.004772 \n", + " tclust 20.0 0.004993 0.000081 0.004868 0.004930 0.004979 \n", + "0.010 sim 20.0 0.009861 0.000069 0.009747 0.009820 0.009857 \n", + " sim_g 20.0 0.010011 0.000069 0.009881 0.009974 0.010007 \n", + " tclosure 20.0 0.009813 0.000069 0.009695 0.009776 0.009810 \n", + " tclust 20.0 0.010016 0.000070 0.009902 0.009970 0.010012 \n", + "\n", + " 75% max \n", + "p variable \n", + "0.001 sim 0.000875 0.001328 \n", + " sim_g 0.001088 0.001517 \n", + " tclosure 0.000858 0.001232 \n", + " tclust 0.001100 0.001937 \n", + "0.002 sim 0.001903 0.002052 \n", + " sim_g 0.002102 0.002242 \n", + " tclosure 0.001859 0.002009 \n", + " tclust 0.002114 0.002253 \n", + "0.005 sim 0.004886 0.005032 \n", + " sim_g 0.005036 0.005195 \n", + " tclosure 0.004837 0.004981 \n", + " tclust 0.005049 0.005205 \n", + "0.010 sim 0.009907 0.009996 \n", + " sim_g 0.010058 0.010138 \n", + " tclosure 0.009857 0.009946 \n", + " tclust 0.010064 0.010155 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sim.groupby([\"p\", \"variable\"])[\"value\"].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Complementarity coefficients vis-a-vis $p(1-p)^2$ in ER random graphs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " count mean std min 25% 50% \\\n", + "q variable \n", + "0.000998 comp 20.0 0.000777 0.000104 0.000558 0.000713 0.000772 \n", + " comp_g 20.0 0.000992 0.000122 0.000734 0.000910 0.000999 \n", + " qclosure 20.0 0.000764 0.000099 0.000559 0.000705 0.000770 \n", + " qclust 20.0 0.000940 0.000138 0.000653 0.000850 0.000914 \n", + "0.001992 comp 20.0 0.001798 0.000043 0.001732 0.001762 0.001800 \n", + " comp_g 20.0 0.001982 0.000048 0.001896 0.001960 0.001974 \n", + " qclosure 20.0 0.001765 0.000042 0.001695 0.001732 0.001765 \n", + " qclust 20.0 0.001960 0.000049 0.001881 0.001926 0.001962 \n", + "0.004950 comp 20.0 0.004788 0.000030 0.004733 0.004775 0.004787 \n", + " comp_g 20.0 0.004947 0.000032 0.004893 0.004934 0.004947 \n", + " qclosure 20.0 0.004744 0.000030 0.004691 0.004730 0.004744 \n", + " qclust 20.0 0.004943 0.000032 0.004882 0.004930 0.004942 \n", + "0.009801 comp 20.0 0.009656 0.000028 0.009608 0.009635 0.009659 \n", + " comp_g 20.0 0.009803 0.000029 0.009750 0.009780 0.009809 \n", + " qclosure 20.0 0.009610 0.000027 0.009563 0.009590 0.009612 \n", + " qclust 20.0 0.009804 0.000029 0.009751 0.009781 0.009808 \n", + "\n", + " 75% max \n", + "q variable \n", + "0.000998 comp 0.000875 0.000942 \n", + " comp_g 0.001089 0.001205 \n", + " qclosure 0.000846 0.000921 \n", + " qclust 0.001053 0.001168 \n", + "0.001992 comp 0.001838 0.001853 \n", + " comp_g 0.002028 0.002043 \n", + " qclosure 0.001808 0.001818 \n", + " qclust 0.001997 0.002037 \n", + "0.004950 comp 0.004812 0.004838 \n", + " comp_g 0.004968 0.005005 \n", + " qclosure 0.004766 0.004793 \n", + " qclust 0.004969 0.004998 \n", + "0.009801 comp 0.009671 0.009709 \n", + " comp_g 0.009817 0.009861 \n", + " qclosure 0.009625 0.009662 \n", + " qclust 0.009818 0.009861 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comp.groupby([\"q\", \"variable\"])[\"value\"].describe().round(6)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "8e0900f3da58897fd209d64081d3031deeb7de3d74b04d540bcaacc585fcac50" + }, + "kernelspec": { + "display_name": "Python 3.9.7 64-bit ('ssc-paper': conda)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.2" + }, + "orig_nbformat": 2 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pathcensus/__init__.py b/pathcensus/__init__.py new file mode 100644 index 0000000..bd280cd --- /dev/null +++ b/pathcensus/__init__.py @@ -0,0 +1,9 @@ +__author__ = "Szymon Talaga" +__email__ = "stalaga@protonmail.com" +__version__ = "0.1" + + +from .graph import GraphABC, adjacency +from .pathcensus import PathCensus +from .nullmodels import UBCM, UECM +from .inference import Inference diff --git a/pathcensus/core/__init__.py b/pathcensus/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pathcensus/core/graph.py b/pathcensus/core/graph.py new file mode 100644 index 0000000..a962c14 --- /dev/null +++ b/pathcensus/core/graph.py @@ -0,0 +1,408 @@ +"""Simple JIT-compiled graph class for calculating path census.""" +from typing import Optional, Tuple +import numpy as np +import numba +from numba.typed import List # pylint: disable=no-name-in-module +from numba.experimental import jitclass +from .types import UInt, Float +from ..definitions import PathDefinitionsUnweighted, 
PathDefinitionsWeighted + + +_NPATHS_UNWEIGHTED = PathDefinitionsUnweighted().npaths +_NPATHS_WEIGHTED = PathDefinitionsWeighted().npaths + + +@jitclass([ + ("E", UInt[:, ::1]), + ("D", UInt[::1]), + ("W", numba.optional(Float[::1])), + ("S", numba.optional(Float[::1])), + ("_strides", UInt[::1]) +]) +class Graph: + """Graph represented as an edgelist. The edgelist representation + is efficient for edge-oriented algorithms such as path census. + + Attributes + ---------- + n_nodes + Number of nodes in the graph. + E + Edgelist as 2D C-contiguous array of + unsigned 64-bit integers. It has to be sorted + by head indices. + W + Optional 1D array of 64-bit floats storing edge weights. + """ + def __init__( + self, + n_nodes: int, + E: np.ndarray, + W: Optional[np.ndarray] = None + ) -> None: + # Sort edge array so i <= j + # and save sorting order indices for sorting weight array + o1, o2 = self._get_ij_ordering(E) + E = E[o1][o2] + + # Add edge ids + eid = np.arange(len(E), dtype=UInt) + _E = np.ascontiguousarray(np.column_stack((eid, E))) + self.E = _E + + # Make strides array for efficient access + # to node neighborhoods. + self._strides = self._make_strides(n_nodes, self.E) + + # Set node degree array + self.D = self.degree() + + if W is not None: + if self.E.shape[0] != W.shape[0]: + raise AttributeError("'E' and 'W' have to be of the same length") + _W = W[o1][o2] + self.W = np.ascontiguousarray(_W) + # Set node strength array + self.S = self.strength() + else: + self.W = None + self.S = None + + # Properties -------------------------------------------------------------- + + @property + def vcount(self): + """Number of nodes.""" + return len(self._strides) - 1 + @property + def n_nodes(self): + return self.vcount + + @property + def ecount(self): + """Number of edges.""" + return len(self.E) // 2 + @property + def n_edges(self): + return self.ecount + + @property + def vids(self): + return np.arange(self.vcount, dtype=UInt) + + @property + def eids(self): + return np.arange(self.ecount, dtype=UInt) + + @property + def directed(self) -> bool: + return False + + @property + def weighted(self) -> bool: + return self.W is not None + + @property + def npaths(self) -> int: + if self.weighted: + return _NPATHS_WEIGHTED + return _NPATHS_UNWEIGHTED + + # Methods ----------------------------------------------------------------- + + def get_edges(self) -> np.ndarray: + """Get undirected edges without self-loops (`i < j`).""" + E = self.E + mask = E[:, 1] < E[:, 2] + return E[mask] + + def get_min_di_edges(self) -> np.ndarray: + """Get undirected edges without self loops (`di <= dj`).""" + E = self.E + D = self.D + i = E[:, 1] + j = E[:, 2] + di = D[i] + dj = D[j] + + mask = (di < dj) | ((di == dj) & (i < j)) + return E[mask] + + def degree(self) -> np.ndarray: + """Get node degrees.""" + return self._strides[1:] - self._strides[:-1] + + def strength(self) -> np.ndarray: + """Get node strengths.""" + S = np.empty((self.vcount,), dtype=Float) + for i, seq in enumerate(zip(self._strides[:-1], self._strides[1:])): + start, end = seq + S[i] = self.W[start:end].sum() + return S + + def N(self, i: int) -> np.ndarray: + """Get 1-neighborhood of ``i``.""" + start, end = self._strides[i:i+2] + return self.E[start:end, np.array([0, 2])] + + # Path counting methods --------------------------------------------------- + + def _count_paths_unweighted( + self, + E: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + """Count unweighted paths. 
+ + Parameters + ---------- + E + Array with edge and source/target indices + to consider in the calculations. + + Returns + ------- + E + 2D array of edges as source and target indices (only i < j). + counts + 2D array with path counts per edge. + """ + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + counts = np.empty((len(E), self.npaths), dtype=Float) + # Array for keeping track of node roles + role = np.zeros((self.n_nodes,), dtype=np.uint8) + + D = self.D # degree sequence + + # Main loop + for idx, edge in enumerate(E): + _, i, j = edge + + _zero = Float(0) + + t = tw = th = _zero + q = qw = qh = _zero + + star_i = set(List.empty_list(UInt)) + star_j = set(List.empty_list(UInt)) + tri_ij = set(List.empty_list(UInt)) + + for _, k in self.N(i): + if k == j: + continue + + star_i.add(k) + role[k] = 1 + # Count wedge triples + tw += 1 + + for _, k in self.N(j): + if k == i: + continue + + if role[k] == 1: + # Count triangles + t += 1 + star_i.remove(k) + tri_ij.add(k) + role[k] = 3 + else: + star_j.add(k) + role[k] = 2 + + # Count head triples + th += 1 + + for k in star_i: + for _, l in self.N(k): + if l == i: + continue + # Count strong quadrangles + if role[l] == 2: + q += 1 + + # Count wedge and head quadruples + # and clear `role` vector. + for k in star_i: + qw += D[k] - 1 + role[k] = 0 + for k in star_j: + qh += D[k] - 1 + role[k] = 0 + for k in tri_ij: + n = D[k] - 2 + qw += n + qh += n + role[k] = 0 + + counts[idx] = ( + t, tw, th, + q, qw, qh + ) + + return E, counts + + def _count_paths_weighted( + self, + E: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + """Count weighted paths. + + Parameters + ---------- + E + Array with edge and source/target indices + to consider in the calculations. + + Returns + ------- + E + 2D array of edges as source and target indices (only i < j). + counts + 2D array with weighted path counts per edge. + """ + # pylint: disable=too-many-locals,too-many-branches,too-many-statements + counts = np.zeros((len(E), self.npaths), dtype=Float) + # Array for keeping track of node roles + role = np.zeros((self.n_nodes,), dtype=np.uint8) + + D = self.D # degree sequence + S = self.S # strength sequence + W = self.W # edge weights array + + # Weight arrays for links from `i` and `j` to their neighbors + Wi = np.empty((self.n_nodes,), dtype=Float) + Wj = np.empty((self.n_nodes,), dtype=Float) + + # Main loop + for idx, edge in enumerate(E): + ij, i, j = edge + wij = W[ij] + + _zero = Float(0) + + twc = thc = tw = th = _zero + q0wc = q0hc = qw = qh = _zero + + star_i = set(List.empty_list(UInt)) + star_j = set(List.empty_list(UInt)) + tri_ij = set(List.empty_list(UInt)) + + for ik, k in self.N(i): + if k == j: + continue + star_i.add(k) + role[k] = 1 + wik = W[ik] + Wi[k] = wik + # Count wedge triples + w = (wij + wik) / 2 + tw += w + + for jk, k in self.N(j): + if k == i: + continue + wjk = W[jk] + Wj[k] = wjk + w = (wij + wjk) / 2 + # Handle triangles + if role[k] == 1: + star_i.remove(k) + tri_ij.add(k) + role[k] = 3 + # Count triangles + # both with wedge and head weighting + twc += (wij + Wi[k]) / 2 + thc += w + else: + star_j.add(k) + role[k] = 2 + # Count head triples + th += w + + for k in star_i: + wik = Wi[k] + for kl, l in self.N(k): + if l == i: + continue + # Count strong quadrangles + if role[l] == 2: + wkl = W[kl] + wjl = Wj[l] + q0wc += (wij + wik + wkl) / 3 + q0hc += (wij + wjl + wkl) / 3 + + # Count wedge and head quadruples of `i` + # and clear `role` vector. 
+ for k in star_i: + wik = Wi[k] + qw += ((D[k]-1)*(wij+wik) + S[k] - wik) / 3 + role[k] = 0 + for k in star_j: + wjk = Wj[k] + qh += ((D[k]-1)*(wij+wjk) + S[k] - wjk) / 3 + role[k] = 0 + for k in tri_ij: + wik = Wi[k] + wjk = Wj[k] + dk = D[k] + sk = S[k] + qw += ((dk-2)*(wij+wik) + sk - wjk - wik) / 3 + qh += ((dk-2)*(wij+wjk) + sk - wjk - wik) / 3 + role[k] = 0 + + counts[idx] = ( + twc, thc, tw, th, + q0wc, q0hc, qw, qh + ) + + return E, counts + + def count_paths( + self, + E: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: + """Count paths. + + Parameters + ---------- + E + Array with edge and source/target indices + to consider in the calculations. + + Returns + ------- + E + 2D array of edges as source and target indices (only i < j). + paths + 2D array with weighted path counts per edge. + """ + if self.weighted: + E, counts = self._count_paths_weighted(E) + else: + E, counts = self._count_paths_unweighted(E) + return ( + np.ascontiguousarray(E[:, 1:]), + np.ascontiguousarray(counts) + ) + + # Internals --------------------------------------------------------------- + + def _make_strides(self, n_nodes, E): + strides = np.zeros((n_nodes+1,), dtype=UInt) + last = eid = UInt(0) + for eid, i in E[:, :2]: + diff = i - last + if diff > 0: + strides[i] = eid + if diff > 1: + for k in range(last+1, i): + strides[k] = eid + last = i + for k in range(last+1, len(strides)): + strides[k] = len(E) + return strides + + def _get_ij_ordering(self, E): + o1 = np.argsort(E[:, 1]) + o2 = np.argsort(E[o1, 0]) + return o1, o2 diff --git a/pathcensus/core/parallel.py b/pathcensus/core/parallel.py new file mode 100644 index 0000000..66c5818 --- /dev/null +++ b/pathcensus/core/parallel.py @@ -0,0 +1,58 @@ +"""Internal routines for path/cycle counting.""" +from typing import Tuple +import numpy as np +import numba +from .graph import Graph +from .types import Float + + +@numba.njit(parallel=True, boundscheck=False, nogil=True, cache=False) +def count_paths_parallel( + graph: Graph, + batch_size: int = 10, + min_di: bool = True, + shuffle: bool = True +) -> Tuple[np.ndarray, np.ndarray]: + """Count paths and cycles using parallel algorithm. + + Parameters + ---------- + graph + Compiled :py:class:`pathcensus.core.graph.Graph` instance. + batch_size + Number of edges processed in one batch. + Usually should not be very large, but also cannot be too small. + The default value often works quite well. + min_di + Should `di < dj` rule for iterating over edges be used. + See :meth:`pathcensus.PathCensus.count_paths` for details. + Almost always should be set to ``True``. + The argument is used mostly for testing purposes. + shuffle + Should rows of the edge array be first reshuffled randomly. + This often improves performance by decreasing the likelihood + of concurrent accesses to the same elements of the edge array + by different threads. 
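+
+    Returns
+    -------
+    E
+        2D array of edges as source and target indices (only i < j).
+    counts
+        2D array with path counts per edge.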
+ """ + if min_di: + E = graph.get_min_di_edges() + else: + E = graph.get_edges() + + if shuffle: + np.random.shuffle(E) + + n_edges = len(E) + counts = np.zeros((n_edges, graph.npaths), dtype=Float) + n_batches = int(np.ceil(n_edges / batch_size)) + + for i in numba.prange(n_batches): # pylint: disable=not-an-iterable + start = batch_size * i + batch = E[start:start+batch_size] + _, _counts = graph.count_paths(batch) + counts[start:start+batch_size] = _counts + + return ( + np.ascontiguousarray(E[:, 1:]), + np.ascontiguousarray(counts) + ) diff --git a/pathcensus/core/random.py b/pathcensus/core/random.py new file mode 100644 index 0000000..8118bb6 --- /dev/null +++ b/pathcensus/core/random.py @@ -0,0 +1,8 @@ +"""Numba random number generator utilities.""" +import numpy as np +from numba import njit + +@njit +def set_numba_seed(seed: int) -> None: + """Set seed for Numba random numbers generator.""" + np.random.seed(seed) diff --git a/pathcensus/core/types.py b/pathcensus/core/types.py new file mode 100644 index 0000000..ba31435 --- /dev/null +++ b/pathcensus/core/types.py @@ -0,0 +1,6 @@ +"""Core types used by compiled code.""" +import numba +from ..types import UInt, Float + +UInt = numba.from_dtype(UInt) +Float = numba.from_dtype(Float) diff --git a/pathcensus/definitions.py b/pathcensus/definitions.py new file mode 100644 index 0000000..954b274 --- /dev/null +++ b/pathcensus/definitions.py @@ -0,0 +1,198 @@ +"""Path counting and aggregation definitions.""" +from typing import List, Tuple, Dict, Iterable + + +class PathDefinitions: + """Base class for path definitions. + + See Also + -------- + pathcensus.definitions.PathDefinitionsUnweighted : unweighted definitions + pathcensus.definitions.PathDefinitionsWeighted : weighted definitions + """ + __instance = None + + def __new__(cls): + if cls.__instance is None: + instance = super().__new__(cls) + instance.__init__() + cls.__instance = instance + return cls.__instance + + def __iter__(self) -> Iterable[str]: + yield from self.definitions["sim"] + yield from self.definitions["comp"] + + def __getitem__(self, key): + return self.definitions[key] + + @property + def definitions(self) -> Dict: + """Raw path definitions. Weighted and unweighted definitions + are derived from this. + """ + return dict( + sim=("twc", "thc", "tw", "th"), + comp=("q0wc", "q0hc", "qw", "qh"), + swap=( + ("twc", "thc"), ("tw", "th"), + ("q0wc", "q0hc"), ("qw", "qh"), + ) + ) + + @property + def aggregation(self) -> Dict: + """Aggregation rules for computing node and global counts + from edge counts. The integers specify the factor to divide + by the sum over edge counts to get a corresponding node/global count. 
+ """ + tri = ("twc", "thc") + quad = ( + "q0wc", "q0hc", + ) + opaths = ("tw", "th", "qw", "qh") + dct = { + "nodes": { + **{ k: 2 for k in (*tri, *quad) }, + }, + "global": { + **{ k: 6 for k in tri }, + **{ k: 8 for k in quad }, + **{ k: 2 for k in opaths } + } + } + out = {} + for mode, agg in dct.items(): + d = {} + for k, v in agg.items(): + d[self.resolve(k)] = v + out[mode] = d + return out + + @property + def aliases(self) -> Dict: + """Aliases mapping actual path names to raw names.""" + return {} + + @property + def npaths(self) -> int: + """Number of different path/cycle counts.""" + return len(self.definitions["sim"]) + len(self.definitions["comp"]) + + def resolve(self, name) -> str: + """Resolve path name alias.""" + if name in self.aliases: + return self.aliases[name] + if name in self.list(): + return name + raise ValueError(f"incorrect path name '{name}'") + + def list(self) -> List[str]: + """List path names.""" + return list(self) + + def enumerate(self) -> List[Tuple[str, int]]: + """Enumerate path names.""" + return [ (p, i) for i, p in enumerate(self) ] + + def get_swap_rules(self) -> List[Tuple[int, int]]: + """Get swap rules for counting reversed paths. + + They define indices of pairs of columns which need to be + swaped in order to get reversed paths. Note that reverses + of wedge paths are head paths and vice versa. In the case + of weighted paths also wedge/head cycle counts need to be + reversed. + """ + omap = dict(self.enumerate()) + return [ + (omap[left], omap[right]) + for left, right in self["swap"] + ] + + def get_column_names(self) -> List[str]: + """Get names of path columns used once the reverse counting + is done. + """ + return list(self) + + def get_column_ids(self) -> List[int]: + """Get indices of path columns to leave + once reversed counting is done. + """ + omap = dict(self.enumerate()) + return [ omap[path] for path in self.get_column_names() ] + + +class PathDefinitionsUnweighted(PathDefinitions): + """Unweighted path definitions. + + **Similarity-related paths** + + t + Triangles. + tw + Wedge-triples around ``i`` (i.e. ``k-i-j`` paths). + th + Head-triples originating from ``i`` (i.e. ``i-j-k`` paths). + + **Complementarity-related paths** + + q0 + Strong quadrangles. + qw + Wedge-quadruples around ``i`` (i.e. ``k-i-j-l`` paths). + qh + Head-quadruples originating from ``i`` (i.e. ``i-j-k-l`` paths). + """ + @property + def definitions(self) -> Dict: + return dict( + sim=("t", "tw", "th"), + comp=( + "q0", "qw", "qh" + ), + swap=( + ("tw", "th"), + ("qw", "qh") + ) + ) + + @property + def aliases(self) -> Dict: + return { + "twc": "t", + "thc": "t", + "q0wc": "q0", + "q0hc": "q0", + } + + +class PathDefinitionsWeighted(PathDefinitions): + """Weighted path definitions. + + **Similarity-related paths** + + twc + Closed wedge triples or triangles weighted by ``ij`` and ``ik`` edges. + thc + Closed head triples or triangles weighted by ``ij`` and ``jk`` edges. + tw + Wedge triples. + th + Head triples. + + + **Complementarity-related paths** + + q0wc + Closed wedge quadruples with no chords (strong quadrangles) + weighted by ``ij``, ``jk``, and ``il`` edges. + qw + Wedge quadruples. + q0hc + Closed head quadruples with no chords (strong quadrangles) + weighted by ``ij``, ``jk`` and ``kl`` edges. + qh + Head quadruples. 
+ """ diff --git a/pathcensus/graph.py b/pathcensus/graph.py new file mode 100644 index 0000000..c76a76c --- /dev/null +++ b/pathcensus/graph.py @@ -0,0 +1,190 @@ +"""Arbitrary classes of which instances can be interpreted +as :py:mod:`scipy` sparse matrices or 2D square :py:mod:`numpy` arrays +can be registered as abstract subclasses of :class:`GraphABC`. +This way all main classes/functions implemented in :mod:`pathcensus` +can automatically interpret them as graph-like objects allowing +seemless integration with many different data formats and third-party +packages such as :py:mod:`networkx` or :py:mod:`igraph`. + +In order to register a class a function for converting its instances to +:py:class:`scipy.sparse.spmatrix` (CRS format) needs to be defined. +The conversion is handled by the :py:func:`pathcensus.graph.adjacency` function +which can be overloaded through the single dispatch mechanism. In particular, +it should be called on arrays/sparse matrices extracted from graph classes +to ensure standardized format. See the example below. + +Graph classes defined by :py:mod:`networkx`, :py:mod:`igraph` +and :py:mod:`graph_tool` are registered automatically provided +the packages are installed. + +Below is an example in which a custom conversion from a list of list +format is registered. Arguably, the below implementation is naive and +handles the conversion by simply converting to a :py:mod:`numpy` array, +without checking wheter the array is really 2D and square, but it illustrates +the main logic of registering custom graph-like classes. + +.. doctest:: graph-abc + + >>> import numpy as np + >>> from pathcensus import GraphABC, adjacency, PathCensus + + >>> def _adj_list(graph: list) -> spmatrix: + ... \"\"\"Adjacency matrix from list of lists.\"\"\" + ... A = np.array(graph) + ... return adjacency(A) + + >>> GraphABC.register_type(list, _adj_list) + >>> # A triangle graph + >>> A = [[0, 1, 1], [1, 0, 1], [1, 1, 0]] + >>> # Calculate path census + >>> paths = PathCensus(A) + >>> paths.census("global") + t tw th q0 qw qh + 0 1 3 3 0 0 0 +""" +from abc import ABC +from typing import Callable, Any +from functools import singledispatch +import numpy as np +from scipy.sparse import spmatrix, csr_matrix + + +class GraphABC(ABC): + """Abstract Base Class (ABC) for registering different graphs classes. + + Any kind of graph object from different libraries can be registered + as a subclass of ABC as long as also a function for converting it into + a sparse adjacency matrix is provided at the registration time. + In particular, :py:class:`scipy.sparse.spmatrix` objects are + automatically recognized as graph-like objects. + + This allows all graph-based functions/methods/class defined in + :py:mod:`pathcensus` to operate flexibly on any sort of graph-like + objects / graph implementations. + + Provided the packages are installed methods for handling graph objects + from :py:mod:`networkx`, :py:mod:`igraph` and :py:mod:`graph_tool` + are automatically registered. + + See class method :py:meth:`register_graph` for more info. + """ + @classmethod + def register_type( + cls, + subclass: type, + adj: Callable[..., spmatrix] + ) -> None: + """Register type as a subclass of :py:class:`pathcensus.graph.GraphABC`. + + Parameters + ---------- + subclass + Any graph-like class. + adj + Function for converting `subclass` graphs to sparse adjacency matrices. + It should use the following signature `(graph, **kwds) -> spmatrix`. 
+ The return matrix must be in the format in which `(i, j)` indicate + an edge from `i` to `j` (in case the network is directed). + Using ``**kwds`` is optional and in general it is best to + implement `adj` in such a way that using ``**kwds`` is not necessary, + in particular for detecting whether a graph is weighted and + converting it to a weighted adjacency matrix if necessary. + This way :py:mod:`pathcensus` will be able to automatically choose + weighted methods for weighted graphs. + """ + cls.register(subclass) + adjacency.register(adj) + + +@singledispatch +def adjacency(graph: GraphABC) -> spmatrix: + """Get (unweighted) adjacency matrix of a graph.""" + raise TypeError(f"cannot handle '{type(graph)}' object") + + +# Register 2D square numpy arrays as graph-like class ------------------------- + +def _adj_numpy(graph: np.ndarray) -> spmatrix: + """Convert 2D square array to sparse matrix.""" + if graph.ndim != 2: + raise AttributeError("only 2D arrays are accepted") + if graph.shape[0] != graph.shape[1]: + raise AttributeError("array is not square") + i, j = graph.nonzero() + data = graph[i, j] + adj = csr_matrix((data, (i, j)), shape=graph.shape, dtype=graph.dtype) + return adj + +GraphABC.register_type(np.ndarray, _adj_numpy) + +# Register sparse matrices as graph-like class -------------------------------- + +def _adj_spmat(graph: spmatrix) -> spmatrix: + """Adjacency matrix from sparse matrix. + + It just converts it to CSR format and ensures that no zeros + are represented explicitly. + """ + graph = graph.tocsr() + graph.eliminate_zeros() + return graph + +GraphABC.register_type(spmatrix, _adj_spmat) + + +# Register networkx networks as graph-like class ------------------------------ +try: + import networkx as nx # type: ignore + def _adj_nx(graph: nx.Graph, **kwds: Any) -> spmatrix: + """Adjacency matrix from :py:class:`networkx.Graph` object.""" + adj = nx.convert_matrix.to_scipy_sparse_matrix(graph, **kwds) + return adjacency(adj) + # Register as GraphABC subclass + GraphABC.register_type(nx.Graph, _adj_nx) +except ModuleNotFoundError: + pass + + +# Register igraph networks as graph-like class -------------------------------- +try: + import igraph as ig # type: ignore + def _adj_ig(graph: ig.Graph, **kwds: Any) -> spmatrix: + """Adjacency matrix from :py:class:`igraph.Graph` object.""" + if graph.is_weighted(): + attribute = "weight" + else: + attribute = None + kwds = { "attribute": attribute, **kwds } + adj = graph.get_adjacency_sparse(**kwds) + + if kwds["attribute"] is None: + adj.data[:] = 1 + + return adjacency(adj) + # Register as GraphABC subclass + GraphABC.register_type(ig.Graph, _adj_ig) +except ModuleNotFoundError: + pass + + +# Register graph_tool networks as graph-like class ---------------------------- +try: + # pylint: disable=import-error + import graph_tool.all as gt # type: ignore + def _adj_gt(graph: gt.Graph, **kwds: Any) -> spmatrix: + """Adjacency matrix from :py:class:`graph_tool.Graph` object.""" + if "weight" in graph.edge_properties: + weight = graph.edge_properties["weight"] + else: + weight = None + kwds = { "weight": weight, **kwds } + adj = gt.adjacency(graph, **kwds).T + + if kwds["weight"] is None: + adj.data[:] = 1 + + return adjacency(adj) + # Register as GraphABC subclass + GraphABC.register_type(gt.Graph, _adj_gt) +except ModuleNotFoundError: + pass diff --git a/pathcensus/inference.py b/pathcensus/inference.py new file mode 100644 index 0000000..404fcc2 --- /dev/null +++ b/pathcensus/inference.py @@ -0,0 +1,725 @@ +"""Approximate 
inference for arbitrary graph statistics, +including structural coefficients, can be conducted using +samples from appropriate Exponential Random Graph Models. +The following generic algorithm can be used to solve a wide +range of inferential problems: + +#. Calculate statistics of interest on an observed graph. +#. Sample ``R`` randomized instances from an appropriate null model. +#. Calculate graph statistics on null model samples. +#. Compare observed and null model values. + +:class:`Inference` class implements the above approach. +It is compatible with any registered class of graph-like objects +and any properly implemented subclass of +:class:`pathcensus.nullmodels.base.ERGM` representing a null model +to sample from. + +.. seealso:: + + :mod:`pathcensus.graph` for seamless ``pathcensus`` integration + with arbitrary graph-like classes. + + :mod:`pathcensus.nullmodels` for available null models. + +This simulation-based approach is relatively efficient for graph- +and node-level statistics but can be very computationally expensive +when used for edge-level analyses. Hence, in this case it is often +useful to use various coarse-graining strategies to reduce the number +of unique combinations of values of sufficient statistics. + +.. seealso:: + + :class:`pathcensus.graph.GraphABC` for the abstract class + for graph-like objects. + + :mod:`pathcensus.nullmodels` for compatible ERGM classes. + + :meth:`pathcensus.inference.Inference.coarse_grain` + for coarse-graining methods. + +Below is a simple example of an estimation of p-values of node-wise +structural similarity coefficients in an Erdős–Rényi random graph. +The result, of course, should not be statistically significant. +We use the default significance level of :math:`\\alpha = 0.05` and +Benjamini-Hochberg FDR correction for multiple testing. + +.. testsetup:: inference + + import numpy as np + np.random.seed(34) + +.. doctest:: inference + + >>> import numpy as np + >>> from scipy import sparse as sp + >>> from pathcensus import PathCensus + >>> from pathcensus.inference import Inference + >>> from pathcensus.nullmodels import UBCM + >>> np.random.seed(34) + >>> # Generate ER random graph (roughly) + >>> A = sp.random(100, 100, density=0.05, dtype=int, data_rvs=lambda n: np.ones(n)) + >>> A = (A + A.T).astype(bool).astype(int) + >>> ubcm = UBCM(A) + >>> err = ubcm.fit() + >>> infer = Inference(A, ubcm, lambda g: PathCensus(g).similarity()) + >>> data, null = infer.init_comparison(100) + >>> pvals = infer.estimate_pvalues(data, null, alternative="greater") + >>> # Structural similarity coefficient values + >>> # should not be significant more often than 5% of times + >>> # (BH FDR correction is used) + >>> (pvals <= 0.05).mean() <= 0.05 + True +""" +from __future__ import annotations +from typing import Any, Optional, Mapping +from typing import Sequence, Tuple, Dict, Callable, Literal, Union +from dataclasses import dataclass +import numpy as np +import numba +import pandas as pd +from statsmodels.stats.multitest import fdrcorrection_twostage +from tqdm.auto import tqdm +from .nullmodels.base import ERGM +from .types import GraphABC, Data + + +class Inference: + """Generic approximate statistical inference based on + arbitrary null models with node-level sufficient statistics. + + The methods implemented by this class are based on sampling + from null models so they may not be very efficient, in particular + for edge-level statistics. 
On the other hand, they allow to conduct + statistical inferency for arbitrary graph statistics. + + Attributes + ---------- + graph + Graph-like object representing an observed network. + model + Fitted instance of a subclass of + :py:class:`pathcensus.nullmodels.ERGM`. + statistics + Function for calculating graph statistics of interest with the + following signature:: + + (graph, **kwds) -> DataFrame / Series + + The first argument must be a graph-like object (e.g. a sparse matrix), + ``**kwds`` can be used to pass additional arguments + (only keyword args are allowed) if necessary. + The return value must be either a :py:class:`pandas.DataFrame` + or :py:class:`pandas.Series`. + aggregate_by + Mode of aggregation for determining null distribution. + If ``"stats"`` then null distribution is aggregated within + unique combinations of values of the sufficient statistics + (possibly coarse-grained, see :py:meth:`init_comparison`). + If ``"units"`` then null distribution is aggregated within individual + units (e.g. nodes). This is often useful in analyses at the level + of nodes but may require too many samples for edge-level analyses. + """ + _alternative = ("greater", "less") + _aggregate_by = ("stats", "units") + _filter_index = ("values", "range") + index_names = ("i", "j") + + @dataclass + class Levels: + """Container class for storing information on unit, + sufficient statistics and other index levels in observed + and null model data. + """ + units: Tuple[str, ...] + stats: Tuple[str, ...] + other: Tuple[str, ...] + + def __bool__(self) -> bool: + return bool(self.units or self.stats or self.other) + + def __init__( + self, + graph: GraphABC, + model: ERGM, + statistics: Callable[[GraphABC], Data], + *, + aggregate_by: Literal[_aggregate_by] = _aggregate_by[0] # type: ignore + ) -> None: + """Initialization method.""" + self.graph = graph + self.model = model + self.statistics = statistics + self.aggregate_by = self._check_vals( + aggregate_by=aggregate_by, + allowed=self._aggregate_by + ) + + def __call__( + self, + graph: GraphABC, + _stats: Optional[np.ndarray] = None, + **kwds: Any + ) -> Data: + """This method should be called to actually calculate graph statistics. + + Parameters + ---------- + graph + Graph-like object to calculate statistics for. + stats + Array of sufficient statistics for nodes. + If ``None`` then `self.model.statistics` is used. + **kwds + Passed to graph statistics function. + """ + data = self.statistics(graph, **kwds) + if _stats is None: + _stats = self.model.extract_statistics(graph) + # _stats = self.model.statistics + out = self._postprocess_data(data, _stats) + return out + + @property + def n_nodes(self) -> int: + """Number of nodes in the observed network.""" + return self.model.n_nodes + + def get_levels(self, data: Data) -> Levels: + """Get index levels descriptor from a data object.""" + nodes = tuple(l for l in data.index.names if l in self.index_names) + allowed_stats = [ f"{l}{i}" for l in self.model.labels for i in nodes ] + stats = tuple(l for l in data.index.names if l in allowed_stats) + other = tuple( + l for l in data.index.names + if l and l not in (*nodes, *stats) + ) + return self.Levels(nodes, stats, other) + + def simulate_null( + self, + n: int, + *, + progress: bool = False, + progress_kws: Optional[Dict] = None, + use_observed_stats: bool = True, + **kwds: Any + ) -> Data: + """Get data frame of null model samples of strucutral coefficients. + + Parameters + ---------- + n + Number of samples. 
+ progress + Should progress bar be showed. + progress_kws + Keyword arguments for customizing progress bar when + ``progress=True``. Passed to :py:func:`tqdm.tqdm`. + use_observed_stats + If ``True`` then simulated data is indexed with + sufficient statistics from the observed network. + This often helps to accumulate enough observations + faster at the expense of not fully exact conditioning. + **kwds + Keyword arguments passed to :py:meth:`statistics`. + + Returns + ------- + null + Data frame with simulated null distribution. + """ + rand = [] + keys = [] + + simulator = self.model.sample(n) + progress_kws = { + **(progress_kws or {}), + "disable": not progress, + "total": n + } + + for i, graph in tqdm(enumerate(simulator), **progress_kws): + keys.append(i) + _stats = self.model.statistics if use_observed_stats else None + rand.append(self(graph, _stats=_stats, **kwds)) + + null = pd.concat(rand, keys=keys, names=["_sample"]) + + return null + + def filter_index( + self, + data: Data, + target: Data, + *, + how: Literal[_filter_index] = _filter_index[0], # type: ignore + levels: Optional[Union[Sequence[str], Sequence[int]]] = None + ) -> Data: + """Filter ``data`` by index with respect to ``target``. + + Parameters + ---------- + data + Data to filter. + target + Dataset with target index. + how + How index should be filtered. + Either by unique combinations of values or just contained + to the range of values for separate levels in ``target``. + levels + Levels to use for filtering. If ``None`` then either + ``self.levels.units`` or ``self.levels.stats`` is used depending + on the value of ``self.aggregate_by``. + + Returns + ------- + data + Filtered copy of ``data``. + """ + how = self._check_vals(how=how, allowed=self._filter_index) + + l = self.get_levels(target) + if levels is None: + levels = l.stats if self.aggregate_by == "stats" else l.units + + # Filter by unique combinations of values in `target` + if how == "values": + # Determinex index values in `data` + remove = [ n for n in data.index.names if n not in levels ] + didx = data.reset_index(remove).index if remove else data.index + # Determine index values in `target` + remove = [n for n in target.index.names if n not in levels ] + tidx = target.reset_index(remove).index if remove else target.index + data = data[didx.isin(tidx)] + # Filter down to ranges of levels in `target` + else: + for level in levels: + tidx = target.index.get_level_values(level).values + didx = data.index.get_level_values(level).values + minx = min(tidx) + maxx = max(tidx) + data = data[(didx >= minx) & (didx <= maxx)] + + return data + + def init_comparison( + self, + n: int, + *, + filter_index: Union[bool, Literal[_filter_index]] = False, # type: ignore + sample_index: bool = False, + null_kws: Optional[Dict] = None, + **kwds: Any + ) -> Tuple[Data, Data, Levels]: + """Initialize data for a comparison with a null model + and determine index level names. + + Parameters + ---------- + n + Number of null model samples. + filter_index + If ``True`` or ``"values"`` then ``null`` will be filtered to + contain only observations with index values matching those in + ``data`` with levels used for the comparison selected based on + ``self.aggregate_by``. If ``"range"`` then null model samples + will be filtered to be in the range of index values in the + observed data. + sample_index + If ``False`` then ``_sample`` index with sample ids + is dropped from ``null`` data frame with null model samples. 
+ null_kws: + Keyword args passed to :py:meth:`simulate_null`. + **kwds + Passed to :py:meth:`statistics` method used for calculating + statistics of interest. + + Notes + ----- + Estimating distributions of edge-wise statistics conditional + on sufficient statistics of the participating nodes may + require really large number of samples and in general is + not really feasible for large networks (in particular weighted). + The same applies, although probably to slightly lesser degree, + to node-wise statistics when ``use_observed_stats=False`` + is passed to :py:meth:`simulate_null`. + + Efficient methods for solving these problems will be implemented + in the future. + + Returns + ------- + data + Observed graph statistics. + null + Null distribution samples. + """ + if isinstance(filter_index, bool): + if filter_index: + filter_index = "range" + else: + filter_index = self._check_vals( + filter_index=filter_index, + allowed=self._filter_index + ) + + data = self(self.graph, **kwds) + self._validate_data(data) + + null_kws = (null_kws or {}) + null = self.simulate_null(n, **(null_kws or {}), **kwds) + + levels = self.get_levels(data) + remove = levels.units if self.aggregate_by == "stats" else levels.stats + remove = [ *remove, *levels.other ] + if remove: + null.reset_index(remove, drop=True, inplace=True) + + data = pd.concat([data], keys=[0], names=["_"]) + null = pd.concat([null], keys=[0], names=["_"]) + + data = self._remove_unnamed_indexes(data) + null = self._remove_unnamed_indexes(null) + + if not sample_index: + null.reset_index("_sample", drop=True, inplace=True) + + if filter_index: + null = self.filter_index(null, data, how=filter_index) + + return data, null + + def postprocess(self, data: Data, target: Data) -> Data: + """Postprocess data after running a comparison. + + This mainly involves sanitizing index names after aggregation + as well as setting proper shape and types for outputs + so ``data`` has the same general form as ``target``. + """ + if isinstance(data, pd.Series) and isinstance(target, pd.DataFrame): + data = data.to_frame().T + data.index = target.index + if isinstance(data, (pd.Series, pd.DataFrame)): + data = self.postprocess_index(data) + return data + + def postprocess_index(self, data: Data) -> Data: + """Postprocess index after running a comparison. + + This involves getting rid of temporary index names + used when running comparisons as well as any unnamed indexes. + Moreover sufficient statistics indexes are removed + if ``self.aggregate_by == "unit"`` or ensuring that observed values of + sufficient statistics are used in the index + (instead of coarse-grained values) if ``self.aggregate == "stats"``. 
+ """ + levels = self.get_levels(data) + # Remove sufficient statistics indexes + remove = [ l for l in levels.stats if l in data.index.names ] + # Remove auxiliary index `_` + if "_" in data.index.names: + remove.append("_") + if remove: + data.reset_index(remove, drop=True, inplace=True) + # Remove unnamed indexes + remove = [ i for i, n in enumerate(data.index.names) if n is None ] + if remove: + data.reset_index(remove, drop=True, inplace=True) + # Add indexes if necessary + if self.aggregate_by == "stats": + data = self.add_stats_index(data, self.model.statistics) + return data + + def estimate_pvalues( + self, + data: pd.DataFrame, + null: pd.DataFrame, + *, + alternative: Literal[_alternative] = _alternative[0], # type: ignore + adjust: bool = True, + resolution: int = 1000, + **kwds: Any + ) -> Data: + """Estimate p-values of node/edge/global coefficients + based on sampling from a configuraiton model + (as returned by :py:meth:`init_comparison`). + + Parameters + ---------- + data + Data frame with observed graph statistics. + null + Data frame with simulated null distribution + of graph statistics. + alternative + Type of test two perform. + Currently only one-sided tests are supported. + adjust + Should p-values be adjusted. Benjamini-Hochberg FDR correction is + used by default when ``True``. + resolution + Resolution of p-value estimation. It specifies the number + of quantiles to comapre observed values against. + For instance, if ``resolution=100`` then p-values + will be accurate only up to ``0.01``. + This parameter controls the amount of memory consumed + by the estimation process. + **kwds + Passed as additional arguents to :meth:`adjust_pvalues` + when ``adjust=True``. + + See Also + -------- + adjust_pvalues : p-value adjustment method + + Returns + ------- + pvalues + P-values for statistics as :py:class:`pandas.Series` + (for one graph statistic) or :py:class:`pandas.DataFrame` + (for multiple statistics). + """ + alternative = self._check_vals( + alternative=alternative, + allowed=self._alternative + ) + levels = self.get_levels(data) + + if levels.units: + null = self.filter_index(null, data, how="values") + + # Quantile data frame + if self.aggregate_by == "units": + keys = levels.units + else: + keys = levels.stats + + if keys: + qdf = null.groupby(level=keys) + else: + qdf = null + + qdf = qdf.quantile(np.arange(0, resolution+1) / resolution) \ + .reset_index(level=-1, drop=True) + + idx = qdf.index.to_frame().rename(columns={0: None}) + idx.insert(0, "_", 0) + idx = pd.MultiIndex.from_frame(idx) + qdf.index = idx + + # Estimate p-values + if alternative == "greater": + pvals = data.le(qdf) + else: + pvals = data.ge(qdf) + + pvals = pvals.fillna(True) + if levels.units: + pvals = pvals.groupby(level=levels.units) + + pvals = pvals.mean() + + if adjust and levels.units: + adjust_kws = { **kwds, "copy": False } + pvals = self.adjust_pvalues(pvals, **adjust_kws) + + return self.postprocess(pvals, target=data) + + @staticmethod + def adjust_pvalues( + pvals: Data, + *, + alpha: float = 0.05, + copy: True = bool, + **kwds: Any + ) -> Data: + """Adjust p-values for multiple testing. + + Benjamini-Hochberg-Yekuteli two-stage procedure implemented in + :py:func:`statsmodels.multitest.fdrcorrection_twostage` + is used. + + Parameters + ---------- + pvals + Data frame / series with p-values for different coefficients + in columns. + alpha + Desired type I error rate after the adjustement. + copy + Should copy of ``pvals`` be returned. 
+ **kwds + Additional arguments passed to + :py:func:`statsmodels.multitest.fdrcorrection_twostage` + """ + if copy: + pvals = pvals.copy() + shape = pvals.shape + pv = pvals.values.flatten() + _, pv, *_ = fdrcorrection_twostage(pv, alpha=alpha, **kwds) + pvals.values[:] = np.clip(pv.reshape(shape), 0, 1) + return pvals + + @staticmethod + def add_index( + data: Data, + idx: Mapping[str, Sequence], + *, + prepend: bool = False, + drop_unnamed: bool = True, + copy: bool = True + ) -> Data: + """Add index to a data frame or series. + + Parameters + ---------- + data + Data frame or series. + idx + Mapping from index names to sequences of values. + prepend + Should new indexes be prepended or appended + to the existing indexes. + drop_unnamed + Should unnamed indexes be droppped during the process. + Unnamed indexes are usually generic indexes which are + redundant after adding additional indexes. + copy + Should a copy be returned. + """ + idx = pd.DataFrame(idx) + + if copy: + data = data.copy() + + idf = data.index.to_frame(index=False) + + if drop_unnamed: + use = [ i for i, n in enumerate(data.index.names) if n is not None ] + idf = idf.iloc[:, use] + + objs = [ idx, idf ] if prepend else [ idf, idx ] + objs = [ d for d in objs if not d.empty ] + if objs: + idf = pd.concat(objs, axis=1, ignore_index=False) + else: + # Return if there no resulting indexes + return data.reset_index(drop=True) + + if len(idf) == 0: + return data + if len(idf) == 1: + idx = pd.Index(idf.iloc[:, 0]) + else: + idx = pd.MultiIndex.from_frame(idf) + + data.index = idx + return data + + def add_stats_index( + self, + data: Data, + stats: Optional[np.ndarray] = None + ) -> Data: + """Add indexes with sufficient statistics. + + Parameters + ---------- + data + Data frame or series with graph statistics. + stats + Array of sufficient statistics. + Use ``self.model.statistics`` if ``None``. + """ + idx = {} + levels = self.get_levels(data) + if stats is None: + stats = self.model.statistics + for u in levels.units: + for i, l in enumerate(self.model.labels): + vals = stats[data.index.get_level_values(u), i] + name = f"{l}{u}" + idx[name] = vals + return self.add_index( + data=data, + idx=idx, + prepend=False, + drop_unnamed=True, + copy=False + ) + + # Internals --------------------------------------------------------------- + + def _postprocess_data( + self, + data: Data, + stats: np.ndarray + ) -> Data: + """Post-process data with calculated graph statistics. + + Parameters + ---------- + data + Calculate graph statistics. + stats + Array with sufficient statistics for nodes. 
+ """ + if np.isscalar(data): + data = pd.Series([data]) + + if stats.ndim == 1: + stats = stats[:, None] + + if self.aggregate_by == "stats": + data = self.add_stats_index(data, stats) + + return data + + def _remove_unnamed_indexes( + self, + data: Data + ) -> Data: + """Remove unnamed indexes.""" + remove = [] + for i, name in enumerate(data.index.names): + if name is None: + remove.append(i) + return data.reset_index(level=remove, drop=True) + + def _check_vals( + self, + *, + allowed: Sequence[str], + **kwds: Any + ) -> str: + """Check if value is okay and return.""" + if len(kwds) != 1: + raise ValueError("exactly two keyword arguments are expected") + allowed = tuple(allowed) + key = list(kwds.keys())[0] + val = list(kwds.values())[0] + if val not in allowed: + raise ValueError(f"'{key}' has to be one of {allowed}") + return val + + def _validate_data(self, data: Data) -> None: + """Check if `data` has correct indexes and shape.""" + if not isinstance(data, (pd.Series, pd.DataFrame)): + m = "'data' has to be either 'Series' or 'DataFrame' instance" + raise TypeError(m) + + levels = self.get_levels(data) + + if levels.stats and not levels.units: + raise AttributeError( + "'data' has sufficient statistics " + "but no node/edge indexes" + ) + + if self.aggregate_by == "stats" and levels.units and not levels.stats: + raise AttributeError( + "'data' has node/edge indexes but no " + "index with sufficient statistics" + ) diff --git a/pathcensus/nullmodels/__init__.py b/pathcensus/nullmodels/__init__.py new file mode 100644 index 0000000..7ebe782 --- /dev/null +++ b/pathcensus/nullmodels/__init__.py @@ -0,0 +1,8 @@ +"""Null model classes implementing different variants +of the configuration model. + +The classes implemented in this module are simple wrappers +around :py:mod:`NEMtropy` package. +""" +from .ubcm import UBCM +from .uecm import UECM diff --git a/pathcensus/nullmodels/base.py b/pathcensus/nullmodels/base.py new file mode 100644 index 0000000..a2d6780 --- /dev/null +++ b/pathcensus/nullmodels/base.py @@ -0,0 +1,814 @@ +"""Exponential Random Graph Models (ERGM) with local constraints are +such ERGMs in which sufficient statistics are defined at the level of +individual nodes (or globally for the entire graph). In other words, their +values for each node can be set independently. Unlike ERGMs with non-local +constraints which are notoriously problematic +(e.g. due to degenerate convergence and non-projectivity) +they are analytically solvable. Prime examples of ERGMs with local constraints +are configuration models which induce maximum entropy distributions over +graphs with ``N`` nodes with arbitrary expected degree sequence and/or +strength sequence constraints. + +The :py:mod:`pathcensus.nullmodels` submodule implements several such +ERGMs which are most appropriate for statistical calibration of strucutral +coefficients. They can be applied to simple undirected and unweighted/weighted +networks. + +See Also +-------- +ERGM : base class for ERGMs +pathcensus.nullmodels.ubcm : Undirected Binary Configuration Model + (fixed expected degree sequence) +pathcensus.nullmodels.uecm : Undirected Enhanced Configuration Model + (fixed expected degree and strength sequences assuming positive integer weights) + + +.. note:: + The ERGM functionalities provided by :py:mod:`pathcensus` are simple + wrappers around the :py:mod:`NEMtropy` package. 
+""" +# pylint: disable=abstract-method +from typing import Any, Union, Mapping, Optional +from typing import Literal, Callable, Iterable, Tuple +from types import MappingProxyType +import io +import contextlib +import warnings +from functools import cached_property +import numpy as np +from numba import njit +from scipy.sparse import spmatrix, csr_matrix +from scipy.sparse.linalg import LinearOperator +from NEMtropy import UndirectedGraph, DirectedGraph +from ..types import GraphABC +from ..utils import relerr + + +# Base ERGM ------------------------------------------------------------------- + +class ERGM: + """Generic base class for Exponential Random Graph Models + with local (i.e. node-level) constraints. + + Attributes + ---------- + statistics + 2D (float) array with sufficient statistics for nodes. + First axis is for nodes and second for differen statistics. + fit_args + Dictionary with arguments used in the last call of :py:meth:`fit`. + ``None`` if the model has not been fitted yet. + + Notes + ----- + The following class attributes are required and need to be defined on + concrete subclasses. + + names + Mapping from names of sufficient statistics to attribute + names in the :py:mod:`NEMtropy` solver class storing fitted + model parameters. They must be provided in an order consistent with + ``statistics``. This is a class attribute which must be defined on + subclasses implementing particular models. The mapping must have + stable order (starting from ``python3.6`` an ordinary ``dict`` will do). + However, it is usually better to use mapping proxy objects instead + of dicts as they are not mutable. + labels + Mapping from abbreviated labels to full names of sufficient statistics. + models + Model names as defined in :py:mod:`NEMtropy` allowed for the specific + type of model. Must be implemented on a subclass as a class attribute. + The first model on the list should will be used by default. + """ + # pylint: disable=too-many-public-methods,function-redefined + names = None + aliases = None + models = None + # Default maximum allowed relative error for validation + default_rtol = 1e-1 + # Default fit methods kwds + default_fit_kwds = None + # Solver methods + methods = ("auto", "newton", "fixed-point") + # Allowed values of 'which' argument in `_get_stat` + _stat_which = ("observed", "expected", "parameters") + + def __init__( + self, + statistics: Union[np.ndarray, GraphABC], + **kwds: Any + ) -> None: + """Initialization method. + + Parameters + ---------- + statistics + Array with sufficient statistics or a graph-like object + (registered properly with :py:class:`pathcensus.types.GraphABC`). + **kwds + Passed to :py:meth:`extract_statistics` when `statistics` is + passed a graph-like object. + """ + if isinstance(statistics, GraphABC): + statistics = self.extract_statistics(statistics, **kwds) + + statistics = statistics.astype(float) + if statistics.ndim == 1: + statistics = statistics.reshape(-1, 1) + + self.validate_statistics_shape(statistics) + self.validate_statistics_values(statistics) + + self.statistics = statistics + self.fit_args = {} + + # Properties -------------------------------------------------------------- + + @property + def fullname(self) -> str: + """Full name of model. May be reimplemented on concrete + subclass to allow using shortened class names. 
+ """ + return self.__class__.__name__ + + @property + def models(self) -> Tuple[str]: + if not self.models: + cn = self.__class__.__name__ + raise NotImplementedError( + f"it seems '{cn}' does not define allowed models" + ) + return tuple(self.models) + + @property + def default_model(self) -> str: + return self.models[0] + + @property + def n_nodes(self) -> int: + """Number of nodes in the underlying graph.""" + return len(self.statistics) + + @property + def n_stats(self) -> int: + """Number of sufficient statistics.""" + return len(self.names) + + @property + def directed(self) -> bool: + """Is model directed.""" + raise NotImplementedError + + @property + def weighted(self) -> bool: + """Is model weighted.""" + raise NotImplementedError + + @property + def expected_statistics(self) -> np.ndarray: + """Model-based expected values of sufficient statistics.""" + raise NotImplementedError + + @property + def names(self) -> Mapping: + """Mapping from names to :py:mod:`NEMtropy` solver attribute names + corresponding to sufficient statistics. + """ + names = self.__class__.names + if not names: + cn = self.__class__.__name__ + raise NotImplementedError( + f"it seems that '{cn}' does not define any names" + ) + if isinstance(names, Mapping): + return dict(names) + return dict(list(names)) + + @property + def labels(self) -> Mapping: + """Mapping from short labels to full names corresponding to sufficient + statistics. + """ + if not self.aliases: + cn = self.__class__.__name__ + raise NotImplementedError( + f"it seems '{cn}' does not define any aliases" + ) + return { self.aliases[n]: n for n in self.names } + + @property + def fp_threshold(self) -> int: + """Threshold on the number of nodes after which by default the + fixed-point solver is used instead of the Newton method solver. + """ + return 500 + + @cached_property + def solver(self) -> Union[UndirectedGraph, DirectedGraph]: + """:py:mod:`NEMtropy` graph solver instance.""" + return self.get_nemtropy_graph() + + @property + def pijfunc(self) -> Callable: + """JIT-compiled function calculating :math:`p_{ij}`'s + based on the model. + """ + raise NotImplementedError + + @property + def wijfunc(self) -> Callable: + """JIT-compiled function sampling edge weights :math:`w_{ij}` + based on the model. + """ + self._only_weighted() + raise NotImplementedError + + @property + def Ewijfunc(self) -> Callable: + """JIT-compiled function calculating expected edge weights + :math:`\\mathbb{E}[w_{ij}]` (conditional on being present) + based on the model. + """ + self._only_weighted() + raise NotImplementedError + + @property + def pmv(self) -> Callable: + """JIT-compiled function calculating :math:`Pv` + where :math:`P` is the edge probability matrix + and :math:`v` is an arbitrary vector. + """ + return lambda v: get_pmv(self.X, v, self.pijfunc) + + @property + def rpmv(self) -> Callable: + """JIT-compiled function calculating :math:`vP` + where :math:`P` is the edge probability matrix + and :math:`v` is an abitrary vector. + """ + if self.directed: + raise NotImplementedError + return self.pmv + + @property + def wmv(self) -> Callable: + """JIT-compiled function calculating :math:`Wv` + where :math:`W` is the matrix of expected edge weights + and :math:`v` is an arbitrary vector. 
+ """ + self._only_weighted() + return lambda v: get_wmv(self.X, v, self.pijfunc, self.Ewijfunc) + + @property + def rwmv(self) -> Callable: + """JIT-compiled function calculating :math:`vW` + where :math:`W` is the matrix of expected edge weights + and :math:`v` is an arbitrary vector. + """ + if self.directed: + raise NotImplementedError + return self.wmv + + # Parameters properties and getters --------------------------------------- + + def get_stat( + self, + stat: Union[int, str], + expected: bool = False + ) -> np.ndarray: + """Get sufficient statistic array by index or label. + + Parameters + ---------- + stat + Index or label of a sufficient statistic. + expected + Should observed or expected statistic be returned. + """ + which = "expected" if expected else "observed" + return self._get_stat(stat, which=which) + + def get_param(self, stat: Union[int, str]) -> np.ndarray: + """Get parameter array associated with a given sufficient statistic. + + ``None`` is returned if the model is not yet fitted. + + Parameters + ---------- + stat + Index or label of a sufficient statistic. + """ + return self._get_stat(stat, which="parameters") + + @property + def X(self) -> Optional[np.ndarray]: + """Array with fitted model parameters (1D). + + Raises + ------ + ValueError + If model is not fitted. + """ + self.check_fitted() + return np.concatenate([ + getattr(self.solver, attr) for attr in self.names.values() + ]) + + @property + def parameters(self) -> Optional[np.ndarray]: + """Array with fitted model parameters shaped as ``self.statistics``. + + Raises + ------ + ValueError + If model is not fitted. + """ + return self._get_param_array(self.X) + + @property + def error(self) -> np.ndarray: + """Get maximum overall absolute error of the fit.""" + self.check_fitted() + return self.solver.error + + def get_P( + self, + *, + dense: bool = False + ) -> Union[LinearOperator, np.ndarray]: + """Get matrix of edge probabilities. + + Parameters + ---------- + dense + If ``True`` then a dense array is returned. + Otherwise a :py:class:`scipy.sparse.linalg.LinearOperator` + is returned. + """ + n = self.n_nodes + P = LinearOperator( + shape=(n, n), + matvec=self.pmv, + rmatvec=self.rpmv, + dtype=self.X.dtype + ) + if dense: + P = P@np.eye(n) + return P + + def get_W( + self, + *, + dense: bool = False + ) -> Union[LinearOperator, np.ndarray]: + """Get matrix of expected edge weights. + + Parameters + ---------- + dense + If ``True`` then a dense array is returned. + Otherwise a :py:class:`scipy.sparse.linalg.LinearOperator` + is returned. + + Raises + ------ + NotImplementedError + If called on a model instance which is not weighted. + """ + self._only_weighted() + n = self.n_nodes + P = LinearOperator( + shape=(n, n), + matvec=self.wmv, + rmatvec=self.rwmv, + dtype=self.X.dtype + ) + if dense: + P = P@np.eye(n) + return P + + + # Validation methods ------------------------------------------------------ + + def validate_statistics_shape(self, statistics: np.ndarray) -> None: + """Raise ``ValueError`` if ``statistics`` has an incorrect shape + which is not consistent with the class attribute ``cls.names``. 
+ """ + if statistics.ndim != 2: + raise ValueError("'statistics' array does not have two axes") + ncol = statistics.shape[1] + if ncol != self.n_stats: + cnm = self.__class__.__name__ + raise ValueError( + f"'statistics' array has {ncol} columns while " + f"'{cnm}' class defines {self.n_stats} sufficient statistics" + ) + + def validate_statistics_values(self, statistics: np.ndarray) -> None: + """Raise if ``statistics`` contain incorrect values. + + It must be implemented on a subclass. + + Notes + ----- + Validation of the shape of ``statistics`` is implemented + independently in :py:meth:`validate_statistics_shape` + which is a generic method which in most cases does not need + to be implemented on subclasses. + """ + raise NotImplementedError + + def relerr(self) -> np.ndarray: + """Get error of the fitted expected statistics relative + to the observed sufficient statistics as + ``|expected - observed| / |observed|``. + """ + self.check_fitted() + return relerr(self.expected_statistics, self.statistics) + + def is_valid(self, rtol: Optional[float] = None) -> bool: + """Check if model is approximately correct or that the relative + difference ``|expected - observed| / |observed|`` is not greater + than ``rtol``. + + Parameters + ---------- + rtol + Maximum allowed relative difference. + Class attribute ``default_rtol`` is used when ``None``. + """ + rtol = self.default_rtol if rtol is None else rtol + return self.relerr().max() <= rtol + + def validate(self, rtol: Optional[float] = None) -> None: + """Raise ``ValueError`` if the relative difference + ``|expected - observed| / |observed|``, is greater than ``rtol``. + + Parameters + ---------- + rtol + Maximum allowed relative difference. + Class attribute ``default_rtol`` is used when ``None``. + + Returns + ------- + self + The same model instance if the error is not raised. + """ + rtol = self.default_rtol if rtol is None else rtol + e = self.relerr().max() + is_valid = e <= rtol + if not is_valid: + raise ValueError( + f"maximum relative error, {e}, is greater than {rtol}" + ) + return self + + def is_fitted(self) -> bool: + """Check if model instance is fitted + (this does not check quality of the fit). + """ + return self.solver.x is not None + + def check_fitted(self) -> None: + """Raise `ValueError` if model is not fitted.""" + if not self.is_fitted(): + raise ValueError("model is not fitted; use 'fit' method") + + # Statistics getter methods ----------------------------------------------- + + def extract_statistics(self, graph: GraphABC) -> np.ndarray: + """Extract array of sufficient statistics from a graph-like object.""" + raise NotImplementedError + + # NEMtropy wrapper methods ------------------------------------------------ + + def get_nemtropy_graph(self) -> Union[UndirectedGraph, DirectedGraph]: + """Get :py:mod:`NEMtropy` graph representation instance + appropriate for a given type of model. + """ + raise NotImplementedError + + def fit( + self, + model: Optional[str] = None, + method: Literal[methods] = methods[0], # type: ignore + **kwds + ) -> float: + """Fit model parameters to the observed sufficient statistics + and returns the overall maximum absolute error. + + Parameters + ---------- + model + Type of model to use. Default value defined in + ``self.default_model`` is used when ``None``. + method + Solver method to use. If ``"auto"`` then either Newton or fixed-point + method is used depending on the number of nodes with the threshold + defined by ``self.fp_threshold``. 
+ **kwds + Passed to NEMtropy solver method ``solve_tool``. + + Notes + ----- + Some of the ``**kwds`` may be prefilled (but can be overriden) + with default values defined on ``default_fit_kwds`` class attribute. + + Returns + ------- + self + Fitted model. + """ + if method not in self.methods: + raise ValueError(f"'method' has to be one of {self.methods}") + if method == "auto": + if self.n_nodes < self.fp_threshold: + method = "newton" + else: + method = "fixed-point" + + model = model or self.default_model + kwds = { **(self.default_fit_kwds or {}), **kwds } + kwds = dict(model=model, method=method, **kwds) + + with \ + warnings.catch_warnings(), \ + contextlib.redirect_stdout(io.StringIO()): + warnings.simplefilter("ignore") + self.solver.solve_tool(**kwds) + self.fit_args = MappingProxyType(kwds) + return self + + def sample_one(self) -> spmatrix: + """Sample a graph instance as sparse matrix from the model. + + Returns + ------- + A + Graph instance represented as a sparse matrix (CSR format). + """ + n = self.n_nodes + if self.weighted: + E, W = sample_edgelist_weighted(self.X, n, self.pijfunc, self.wijfunc) + return self._make_adj(E, W) + E = sample_edgelist_unweighted(self.X, n, self.pijfunc) + return self._make_adj(E) + + def sample(self, n: int) -> Iterable[spmatrix]: + """Generate `n` instances sampled from the model. + + Yields + ------ + A + Graph instance represented as a sparse matrix (CSR format) + """ + for _ in range(n): + yield self.sample_one() + + + # Internals --------------------------------------------------------------- + + def _get_param_array(self, X: np.ndarray) -> np.ndarray: + """Get array of model parameters shaped as ``self.statistics``.""" + return X.reshape(self.n_stats, -1).T + + def _get_stat( + self, + stat: Union[int, str], + *, + which: Literal[_stat_which] = _stat_which[0] # type: ignore + ) -> np.ndarray: + """Get particular sufficient statistic by index or label. + + Parameters + ---------- + stat + Index or label of a statistic to extract. + which + Should observed or expected sufficient statistics be returned + or alternatively their associated model parameters. 
+ """ + if which not in self._stat_which: + raise ValueError(f"'which' has to be one of {self._stat_which}") + + if which == "observed": + statistics = self.statistics + elif which == "expected": + statistics = self.expected_statistics + else: + statistics = self.parameters + + if isinstance(stat, int): + return statistics[:, stat] + if isinstance(stat, str): + for idx, name in enumerate(self.names): + if stat == name: + return statistics[:, idx] + raise ValueError( + f"there is no statistic/parameter with label/index '{stat}'" + ) + + def _make_adj( + self, + E: np.ndarray, + W: Optional[np.ndarray] = None + ) -> spmatrix: + """Make adjacency from edgelist and optional edge weighted array.""" + n = self.n_nodes + if W is None: + W = np.ones(len(E), dtype=np.uint8) + i, j = E.T + A = csr_matrix((W, (i, j)), shape=(n, n)) + + if not self.directed: + A += A.T + + return A + + def _only_weighted(self) -> None: + """Raise if model is not weighted.""" + if not self.weighted: + cn = self.__class__.__name__ + raise AttributeError(f"'{cn}' is not weighted") + +# Soft Configuration Model ---------------------------------------------------- + +class SoftConfigurationModel(ERGM): + """Base class for soft configuration models.""" + def validate_statistics_values(self, statistics: np.ndarray) -> None: + """Raise if degree sequence contains negative values.""" + for i, name in enumerate(self.names): + stat = statistics[:, i] + if np.any(stat <= 0): + raise ValueError(f"{name} sequence contains non-positive values") + +# Undirected Soft Configuration Model ----------------------------------------- + +class UndirectedSoftConfigurationModel(SoftConfigurationModel): + """Base class for undirected soft configuration models.""" + aliases = MappingProxyType({ + "degree": "d", + "strength": "s" + }) + + @property + def directed(self) -> bool: + return False + + +# Compiled routines ----------------------------------------------------------- + +@njit(boundscheck=False, nogil=True, cache=True) +def get_pmv( + X: np.ndarray, + v: np.ndarray, + pijfunc: Callable[[np.ndarray, int, int], float] +) -> np.ndarray: + """Calculate :math:`Pv` where :math:`P` is edge probability matrix + and :math:`v` an arbitrary vector. + + Parameters + ---------- + X + 1D array of model parameters. + v + Arbitrary vector. + pijfunc + JIT-compiled function (in no-python mode) calculating edge + probabilities :math:`p_{ij}`. It should have the following + signature: ``(X, i, j) -> float``, where ``X`` is a 1D array + of model parameters. The return value must be a float in ``[0, 1]``. + """ + v = v.flatten() + u = np.zeros_like(v, dtype=X.dtype) + n = len(v) + + for i in range(n): + for j in range(n): + pij = pijfunc(X, i, j) + u[i] += pij * v[j] + + return u + +@njit(boundscheck=False, nogil=True, cache=True) +def get_wmv( + X: np.ndarray, + v: np.ndarray, + pijfunc: Callable[[np.ndarray, int, int], float], + Ewijfunc: Callable[[np.ndarray, int, int], float] +) -> np.ndarray: + """Calculate :math:`Wv` where :math:`W` is expected edge weight matrix + and :math:`v` is an arbitrary vector. + + Parameters + ---------- + X + 1D array of model parameters. + v + Arbitrary vector. + pijfunc + JIT-compiled function (in no-python mode) calculating edge + probabilities :math:`p_{ij}`. It should have the following + signature: ``(X, i, j) -> float``, where ``X`` is a 1D array + of model parameters. The return value must be a float in ``[0, 1]``. 
+ Ewijfunc + JIT-compiled function (in no-python mode) calculating expected + edge weights :math:`\\mathbb{E}[p_{ij}]`. It should have the + following signature ``(X, i, j) -> float``, where ``X`` is a 1D array + of model parameters. The return value must be a positive float. + """ + v = v.flatten() + u = np.zeros_like(v, dtype=X.dtype) + n = len(v) + + for i in range(n): + for j in range(n): + pij = pijfunc(X, i, j) + wij = Ewijfunc(X, i, j) + u[i] += pij * wij * v[j] + + return u + +@njit(boundscheck=False, nogil=True, cache=True) +def sample_edgelist_unweighted( + X: np.ndarray, + n_nodes: int, + pijfunc: Callable[[np.ndarray, int, int], float] +) -> np.ndarray: + """Sample edgelist array from an ERGM. + + Parameters + ---------- + X + 1D array of model parameters. + n_nodes + Number of nodes in hte underlying graph. + pijfunc + JIT-compiled function (in no-python mode) calculating edge + probabilities :math:`p_{ij}`. It should have the following + signature: ``(X, i, j) -> float``, where ``X`` is a 1D array + of model parameters. The return value must be a float in ``[0, 1]``. + + Returns + ------- + E + Edgelist array. + """ + edges = [] + for i in range(1, n_nodes): + for j in range(i): + pij = pijfunc(X, i, j) + + if np.random.rand() <= pij: + edges.append([i, j]) + + return np.array(edges) + +@njit(boundscheck=False, nogil=True, cache=True) +def sample_edgelist_weighted( + X: np.ndarray, + n_nodes: int, + pijfunc: Callable[[np.ndarray, int, int], float], + wijfunc: Callable[[np.ndarray, int, int], Union[int, float]] +) -> Tuple[np.ndarray, Optional[np.ndarray]]: + """Sample edgelist array from an ERGM. + + Parameters + ---------- + X + 1D array of model parameters. + n_nodes + Number of nodes in the underlying graph. + weighted + Is the model weighted + pijfunc + JIT-compiled function (in no-python mode) calculating edge + probabilities :math:`p_{ij}`. It should have the following + signature: ``(X, i, j) -> float``, where ``X`` is a 1D array + of model parameters. The return value must be a float in ``[0, 1]``. + wijfunc + JIT-compiled function (in no-python mode) sampling edge weights + :math:`w_{ij}`. It should have the following signature: + ``(X, i, j) -> float/int``, where ``X`` is a 1D array of model + arameters. The return value must be a positive int/float. + + Returns + ------- + E + Edgelist array. + W + 1D array with edge weights. + """ + edges = [] + weights = [] + for i in range(1, n_nodes): + for j in range(i): + pij = pijfunc(X, i, j) + + if np.random.rand() <= pij: + edges.append([i, j]) + w = wijfunc(X, i, j) + weights.append(w) + + return np.array(edges), np.array(weights) diff --git a/pathcensus/nullmodels/ubcm.py b/pathcensus/nullmodels/ubcm.py new file mode 100644 index 0000000..b67dbc4 --- /dev/null +++ b/pathcensus/nullmodels/ubcm.py @@ -0,0 +1,151 @@ +"""Undirected Binary Configuration Model (UBCM) induces a maximum entropy +probability distribution over networks of a given size such that it has +a specific expected degree sequence. It can be used to model undirected +unweighted networks. See :cite:p:`vallaranoFastScalableLikelihood2021` +for details. + +See Also +-------- +UBCM : UBCM class + + +Examples +-------- + +.. testsetup:: ubcm + + import numpy as np + from pathcensus.nullmodels import UBCM + +.. 
doctest:: ubcm + + >>> # Make simple ER random graph using `igraph` + >>> import random + >>> import igraph as ig + >>> random.seed(101) + >>> G = ig.Graph.Erdos_Renyi(20, p=.2) + >>> # Initialize UBCM directly from the graph object + >>> ubcm = UBCM(G) + >>> # Alternatively, initialize from degree sequence array + >>> D = np.array(G.degree()) + >>> ubcm = UBCM(D).fit() + >>> # Check fit error + >>> round(ubcm.error, 6) + 0.0 + >>> # Mean absolute deviation of the fitted expected degree sequence + >>> # from the observed sequence + >>> (np.abs(ubcm.ED - ubcm.D) <= 1e-6).all() + True + >>> # Set seed of null model sampler and sample ensemble instance + >>> from pathcensus.utils import set_seed + >>> set_seed(17) + >>> ubcm.sample_one() + <20x20 sparse matrix of type '' + with 84 stored elements in Compressed Sparse Row format> + >>> # Sample multiple instances (generator) + >>> for instance in ubcm.sample(10): pass +""" +from typing import Callable +from types import MappingProxyType +import numpy as np +from numba import njit +from NEMtropy import UndirectedGraph +from .base import UndirectedSoftConfigurationModel +from .. import adjacency +from ..utils import rowsums +from ..types import GraphABC + + +class UBCM(UndirectedSoftConfigurationModel): + """Undirected Binary Configuration Model. + + This is a soft configuration model for undirected unweighted networks + which belongs to the family of Exponential Random Graph Models (ERGMs) + with local constraints. It induces a maximum entropy probability distribution + over a set of networks with :math:`N` nodes such that it yields a specific + degree sequence on average. + + Attributes + ---------- + statistics + 2D (float) array with sufficient statistics for nodes. + In this case there is only one sufficient statistic, that is, + the degree sequence. + fit_args + Dictionary with arguments used in the last call of :py:meth:`fit`. + ``None`` if the model has not been fitted yet. + + Notes + ----- + The following important class attributes are also defined: + + labels + Mapping from abbreviated labels to full names identifying sufficient + statistics. + models + Model names as defined in :py:mod:`NEMtropy` allowed for the specific + type of model. 
+ """ + names = MappingProxyType({"degree": "x"}) + models = ("cm_exp", "cm") + # Default `fit` method keyword arguments + default_fit_kwds = MappingProxyType({"initial_guess": "chung_lu"}) + + @property + def fullname(self) -> str: + return "Undirected Binary Configuration Model" + + @property + def weighted(self) -> bool: + return False + + @property + def expected_statistics(self) -> np.ndarray: + """Expected sufficient statistics.""" + return self._get_param_array(self.solver.expected_dseq) + + @property + def D(self) -> np.ndarray: + """Observed degree sequence.""" + return self.get_stat("degree", expected=False) + + @property + def ED(self) -> np.ndarray: + """Expected degree sequence.""" + return self.get_stat("degree", expected=True) + + @property + def pijfunc(self) -> Callable: + r"""JIT-compiled routine for calculating :math:`p_{ij}`.""" + return ubcm_pij + + def extract_statistics(self, graph: GraphABC) -> np.ndarray: + """Extract sufficient statistics from a graph-like object.""" + A = adjacency(graph).copy() + A.data[:] = 1 + return rowsums(A) + + # NEMtropy wrapper methods ------------------------------------------------ + + def get_nemtropy_graph(self) -> UndirectedGraph: + """Get :py:mod:`NEMtropy` graph representation instance.""" + return UndirectedGraph(degree_sequence=self.D) + + +# Sampler routines ------------------------------------------------------------ + +@njit(boundscheck=False, nogil=True, cache=True) +def ubcm_pij(X: np.ndarray, i: int, j: int) -> float: + """Calculate edge probability :math:`p_{ij}` in UBCM model. + + Parameters + ---------- + X + 1D Array of model parameters. + i, j + Node indices. + """ + if i == j: + return 0 + xx = X[i]*X[j] + return xx / (1 + xx) diff --git a/pathcensus/nullmodels/uecm.py b/pathcensus/nullmodels/uecm.py new file mode 100644 index 0000000..0b73905 --- /dev/null +++ b/pathcensus/nullmodels/uecm.py @@ -0,0 +1,221 @@ +"""Undirected Enhanced Configuration Model (UECM) +induces a maximum entropy probability distribution over +networks of a given size such that it has specific expected +degree and strength sequences. It can be used to model undirected +weighted networks with edge weights being positive integers +(with no upper bound). See :cite:p:`vallaranoFastScalableLikelihood2021` +for details. + +See Also +-------- +UECM : UECM class + + +Examples +-------- + +.. testsetup:: uecm + + import numpy as np + from pathcensus.nullmodels import UECM + +.. 
doctest:: uecm + + >>> import random + >>> import igraph as ig + >>> # Make a ER random graph with random integer weights + >>> random.seed(27732) + >>> G = ig.Graph.Erdos_Renyi(20, p=.2) + >>> G.es["weight"] = np.random.randint(1, 11, G.ecount()) + >>> # Initialize UECM from the graph object + >>> uecm = UECM(G) + >>> # Alternatively initialize from an array of sufficient statistics + >>> # 1st column - degree sequence; 2nd column - strength sequence + >>> D = np.array(G.degree()) + >>> S = np.array(G.strength(weights="weight")) + >>> stats = np.column_stack([D, S]) + >>> uecm = UECM(stats).fit() + >>> # Check fit error + >>> round(uecm.error, 6) + 0.0 + >>> # Mean absolute deviation of the fitted expected degree sequence + >>> # from the observed sequence + >>> (np.abs(uecm.ED - uecm.D) <= 1e-6).all() + True + >>> # Mean absolute deviation of the fitted expected strength sequence + >>> # from the observed sequence + >>> (np.abs(uecm.ES - uecm.S) <= 1e-6).all() + True + >>> # Set seed of null model sampler and sample one instance + >>> from pathcensus.utils import set_seed + >>> set_seed(44) + >>> uecm.sample_one() + <20x20 sparse matrix of type '' + with 68 stored elements in Compressed Sparse Row format> + >>> # Sample multiple instances (generator) + >>> for instance in uecm.sample(10): pass +""" +from typing import Callable +from types import MappingProxyType +import numpy as np +from numba import njit +from NEMtropy import UndirectedGraph +from .base import UndirectedSoftConfigurationModel +from ..utils import rowsums +from ..types import GraphABC +from .. import adjacency + + + +class UECM(UndirectedSoftConfigurationModel): + """Undirected Enhanced Configuration Model. + + This is a soft configuration model for undirected weighted networks with + unbounded positive integer weights which belongs to the family + of Exponential Random Graph Models (ERGMs) with local constraints. + It induces a maximum entropy probability distribution over a set of + networks with :math:`N` nodes such that it yields a specific degree sequence + and a specific strenght sequence on average. + + Attributes + ---------- + statistics + 2D (float) array with sufficient statistics for nodes. + In this case there are two sufficient statistics, that is, + the degree sequence and the strength sequence. + fit_args + Dictionary with arguments used in the last call of :py:meth:`fit`. + ``None`` if the model has not been fitted yet. + + Notes + ----- + The following important class attributes are also defined: + + labels + Mapping from abbreviated labels to full names identifying sufficient + statistics. + models + Model names as defined in :py:mod:`NEMtropy` allowed for the specific + type of model. 
+ """ + names = MappingProxyType({ "degree": "x", "strength": "y" }) + models = ("ecm_exp", "ecm") + # Default `fit` method keyword arguments + default_fit_kwds = MappingProxyType({"initial_guess": "strengths_minor"}) + + @property + def weighted(self) -> bool: + return True + + @property + def expected_statistics(self) -> np.ndarray: + """Expected sufficient statistics.""" + return np.column_stack([ + self.solver.expected_dseq, + self.solver.expected_strength_seq + ]) + + @property + def D(self) -> np.ndarray: + """Observed degree sequence.""" + return self.get_stat("degree", expected=False) + + @property + def ED(self) -> np.ndarray: + """Expected degree sequence.""" + return self.get_stat("degree", expected=True) + + @property + def S(self) -> np.ndarray: + """Observed strength sequence.""" + return self.get_stat("strength", expected=False) + + @property + def ES(self) -> np.ndarray: + """Expected strength sequence.""" + return self.get_stat("strength", expected=True) + + @property + def pijfunc(self) -> Callable: + """JIT-compiled routine for calculating :math:`p_{ij}`.""" + return uecm_pij + + @property + def wijfunc(self) -> Callable: + """JIT-compiled routine sampling :math:`w_{ij}`.""" + return uecm_wij + + @property + def Ewijfunc(self) -> Callable: + """JIT-compiled routing for calculating :math:`\\mathbb{E}[w_{ij}]` + (conditional on the edge being present). + """ + return uecm_Ewij + + def extract_statistics(self, graph: GraphABC) -> np.ndarray: + """Extract sufficient statistics from a graph-like object.""" + A = adjacency(graph).copy() + S = rowsums(A) + A.data[:] = 1 + D = rowsums(A) + return np.column_stack([ D, S ]) + + # NEMtropy wrapper methods ------------------------------------------------ + + def get_nemtropy_graph(self) -> UndirectedGraph: + """Get :py:mod:`NEMtropy` graph representation instance.""" + return UndirectedGraph(degree_sequence=self.D, strength_sequence=self.S) + + +# Sampler routines ------------------------------------------------------------ + +@njit(boundscheck=False, nogil=True, cache=True) +def uecm_pij(X: np.ndarray, i: int, j: int) -> float: + """Calculate edge probability :math:`p_{ij}` in UECM model. + + Parameters + ---------- + X + 1D array of model parameters. + i, j + Node indices. + """ + if i == j: + return 0 + n = len(X) // 2 + xx = X[i]*X[j] + yy = X[i+n]*X[j+n] + return xx*yy / (1 - yy + xx*yy) + +@njit(boundscheck=False, nogil=True, cache=True) +def uecm_wij(X: np.ndarray, i: int, j: int) -> int: + """Sample edge weight :math:`w_{ij}` in UECM model. + + Parameters + ---------- + X + 1D Array of model parameters. + i, j + Node indices. + """ + if i == j: + return 0 + n = len(X) // 2 + yy = X[i+n]*X[j+n] + return np.random.geometric(1-yy) + +@njit(boundscheck=False, nogil=True, cache=True) +def uecm_Ewij(X: np.ndarray, i: int, j: int) -> float: + """Calculate expected edge weight :math:`\\mathbb{E}[w_{ij}]` + (conditional on the edge being present) in UECM model. + + Parameters + ---------- + X + 1D array od model parameters. + i, j + Node indices. + """ + n = len(X) // 2 + yy = X[i+n]*X[j+n] + return 1 / (1-yy) diff --git a/pathcensus/pathcensus.py b/pathcensus/pathcensus.py new file mode 100644 index 0000000..b05b945 --- /dev/null +++ b/pathcensus/pathcensus.py @@ -0,0 +1,1235 @@ +""":class:`PathCensus` is the basis which structural similarity and +complementarity coefficients are derived from. 
In its raw form +it is a set of counts of wedge and head triples (2-paths) and quadruples +(3-paths) traversing an ``(i, j)`` edge as well as counts of corresponding +triangles (3-cycles) and quadrangles (4-cycles). +In the weighted case there are separate counts of triangles +and quadrangles for wedge and head paths as average weights are defined +differently for these two cases. + +.. note:: + Path/cycle counts and structural coefficients are returned + as :class:`pandas.Series` or :class:`pandas.DataFrame` objects indexed + properly with integer node indices corresponding to the ordering of + rows in the underlying adjacency matrix of the network. + +.. note:: + Path census calculations are relatively efficient as the main + workhorse functions are just-in-time (JIT) compiled to highly + optimized C code using :py:mod:`numba` package. Moreover, + the path census algorithm is based on a state-of-the-art + graphlet counting algorithm proposed by + :cite:t:`ahmedEfficientGraphletCounting2015` + which has worst-case asymptotic computational complexity of + :math:`O(md^2_{\\text{max}})`, where :math:`m` is the number + of edges and :math:`d_{\\text{max}}` is the maximum degree. + + Moreover, some additional optimizations are used to speed up + the calculations in the case of highly heterogeneous degree + distributions (e.g. power laws). See ``min_di`` argument in + :meth:`PathCensus.count_paths`. + +Node- and graph-level counts are derived from edge-level counts according +to simple aggregations rules. They are defined in definition classes +implemented in :py:mod:`pathcensus.definitions` submodule. + +.. seealso:: + + :class:`pathcensus.definitions.PathDefinitionsUnweighted` + for the naming scheme used for unweighted counts. + + :class:`pathcensus.definitions.PathDefinitionsWeighted` + for the naming scheme used for weighted counts. + +Below a node-level census for a simple triangle graph is counted. + +.. doctest:: census-triangle + + >>> import numpy as np + >>> from pathcensus import PathCensus + >>> G = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]]) + >>> P = PathCensus(G) + >>> # Census calculations can be also parallelized + >>> # by default all available threads are used + >>> P = PathCensus(G, parallel=True) + >>> # But the number of threads can be set explicitly + >>> P = PathCensus(G, num_threads=2) + >>> P.census("nodes") + t tw th q0 qw qh + i + 0 1 2 2 0 0 0 + 1 1 2 2 0 0 0 + 2 1 2 2 0 0 0 + +Structural similarity +--------------------- + +Structural similarity coefficients (:meth:`PathCensus.similarity`) +as well as their corresponding clustering (:meth:`PathCensus.tclust`) +and closure coefficients (:meth:`PathCensus.tclosure`) +are defined quite simply in terms of ratios of 3-cycles (triangles) +to 2- (triples) counted at the levels of edges, nodes or globaly within an +entire graph. The figure below presents a summary of the underlying geometric +motivation as well as the main properties of structural similarity +coefficients, including the differences relative to local clustering and +closure coefficient +:cite:p:`wattsCollectiveDynamicsSmallworld1998,yinLocalClosureCoefficient2019`. + +.. figure:: /figures/sim.svg + :align: center + :alt: Overview of the properties of structural similarity coefficients + + Overview of the properties of structural similarity coefficients. + +Below node-wise structural similarity coefficients are counted for +a simple triangle graph. + +.. 
doctest:: simcoef-nodes + + >>> import numpy as np + >>> from pathcensus import PathCensus + >>> G = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]]) + >>> P = PathCensus(G) + >>> P.similarity("nodes") + i + 0 1.0 + 1 1.0 + 2 1.0 + dtype: float64 + +Structural complementarity +-------------------------- + +Structural complementarity (:meth:`PathCensus.complementarity`) +coefficients and their corresponding clustering +(:meth:`PathCensus.qclust`) and closure coefficients +(:meth:`PathCensus.qclosure`) are defined in terms of ratios of 4-cycles +(quadrangles) to 3-paths (quadruples) and can be defined at the levels +of edges, nodes and entire graphs. The figure below present a summary +of the underlying geometric motivation and some of the main properties. + +.. figure:: /figures/comp.svg + :align: center + :alt: Overview of the properties of structural complementarity coefficients + + Overview of the properties of structural complementarity coefficients. + +Below we calculate complementarity coefficients for nodes +in a 4-clique graph. Note that complementarity coefficients are all +zeros as there are not quadrangles without any chordal edges. + +.. doctest:: compcoefs-nodes + + >>> import numpy as np + >>> from pathcensus import PathCensus + >>> G = np.array([[0,1,1,1],[1,0,1,1],[1,1,0,1],[1,1,1,0]]) + >>> P = PathCensus(G) + >>> P.complementarity("nodes") + i + 0 0.0 + 1 0.0 + 2 0.0 + 3 0.0 + dtype: float64 + +Weighted coefficients +--------------------- + +Structural coefficients can be also defined for weighted networks in which +case paths and cycles are weighted according to the arithmetic average +over edge weights defining an underlying path +(so closing or chordal edges in triangles/quadrangles are ignored). +This can be seen as an extension of the weighted clustering coefficient +proposed by :cite:t:`barratArchitectureComplexWeighted2004`. +Indeed, our formulation of the weighted clustering based on triangles +is equivalent to it. +The figure below presents a summary of the weighting rules. + +.. figure:: /figures/weighted.svg + :align: center + :alt: Overview of weighted path/cycle counts + + Overview of weighted path/cycle counts. + +Edge weights should be detected automatically in most cases provided that +a standard name of edge weight attribute (``"weight"``) is used. +However, weighted computations may be also enabled/disabled explicitly +by using ``weighted`` argument. + +.. doctest:: coefs-weighted + + >>> import numpy as np + >>> from pathcensus import PathCensus + >>> G = np.array([[0,2,3],[2,0,11],[3,11,0]]) + >>> PathCensus(G).census("nodes") + twc thc tw th q0wc q0hc qw qh + i + 0 2.5 6.75 5.0 13.5 0.0 0.0 0.0 0.0 + 1 6.5 4.75 13.0 9.5 0.0 0.0 0.0 0.0 + 2 7.0 4.50 14.0 9.0 0.0 0.0 0.0 0.0 + >>> PathCensus(G, weighted=False).census("nodes") + t tw th q0 qw qh + i + 0 1 2 2 0 0 0 + 1 1 2 2 0 0 0 + 2 1 2 2 0 0 0 +""" +from __future__ import annotations +from typing import Union, Literal, Optional, Any, Tuple, Dict +import numpy as np +from scipy.sparse import csr_matrix +import numba +import pandas as pd +from . import types, adjacency +from .core.graph import Graph +from .core.parallel import count_paths_parallel +from .types import UInt, Float +from .definitions import PathDefinitionsUnweighted, PathDefinitionsWeighted + + +class PathCensus: + """Path census and structural coefficients calculations + for undirected graphs. + + Attributes + ---------- + graph + :class:`pathcensus.core.graph.Graph` instance + for calculating path census. 
+    counts
+        Data frame with path/cycle counts per edge.
+        Initialization may be postponed.
+
+    Notes
+    -----
+    The naming scheme used for denoting counts is documented in the
+    docstring of the ``definitions`` attribute (i.e. ``self.definitions``).
+
+    .. testsetup:: pathcensus
+
+        import numpy as np
+        from pathcensus import PathCensus
+    """
+    # pylint: disable=too-many-public-methods
+    class Meta:
+        """Container class with various metadata, such as lists of
+        allowed values for arguments of different methods
+        of :class:`PathCensus`.
+
+        **Fields**
+
+        mode
+            Allowed values of the ``mode`` argument in structural
+            coefficients methods.
+        undef
+            Allowed values of the ``undefined`` argument in structural
+            coefficients methods.
+        """
+        mode = ("nodes", "edges", "global")
+        undef = ("nan", "zero")
+
+    def __init__(
+        self,
+        graph: types.GraphABC,
+        weighted: Optional[bool] = None,
+        validate: bool = True,
+        adj_kws: Optional[Dict] = None,
+        count_paths: bool = True,
+        **kwds: Any
+    ) -> None:
+        """Initialization method.
+
+        Parameters
+        ----------
+        graph
+            Graph-like object registered with the
+            :py:class:`pathcensus.types.GraphABC` abstract base class,
+            with a single-dispatch converter registered on
+            :py:func:`pathcensus.utils.adjacency`. Sparse matrices
+            work out of the box.
+        weighted
+            Should the graph be interpreted as weighted.
+            Determined automatically if ``None``.
+        validate
+            Should the input graph be validated for correctness
+            (i.e. checked whether it is undirected).
+        adj_kws
+            Additional keyword params passed to
+            :py:func:`pathcensus.utils.adjacency`.
+        count_paths
+            Should the `counts` attribute be initialized immediately.
+            It can be initialized later using the :py:meth:`count` method.
+        **kwds
+            Passed to :py:meth:`count_paths`.
+        """
+        adj_kws = adj_kws or {}
+        graph_kws = dict(weighted=weighted, validate=validate, **adj_kws)
+        self.graph = self.get_graph(graph, **graph_kws)
+
+        # Setup path definition objects
+        if self.weighted:
+            self.definitions = PathDefinitionsWeighted()
+        else:
+            self.definitions = PathDefinitionsUnweighted()
+
+        self.counts = None
+        if count_paths:
+            self.count(**kwds)
+
+    def count(self, **kwds: Any) -> None:
+        """Count paths and set the `self.counts` attribute.
+
+        ``**kwds`` are passed to :py:meth:`count_paths`.
+        """
+        E, counts = self.count_paths(self.graph, **kwds)
+        self.counts = self._make_counts(E, counts)
+
+    # Properties --------------------------------------------------------------
+
+    @property
+    def n_nodes(self) -> int:
+        return self.graph.n_nodes
+
+    @property
+    def vcount(self) -> int:
+        return self.n_nodes
+
+    @property
+    def n_edges(self) -> int:
+        return self.counts.shape[0]
+
+    @property
+    def ecount(self) -> int:
+        return self.n_edges
+
+    @property
+    def weighted(self) -> bool:
+        return self.graph.weighted
+
+    @property
+    def degree(self) -> np.ndarray:
+        """Get degree sequence of the underlying graph."""
+        return self.graph.D
+
+    @property
+    def strength(self) -> np.ndarray:
+        """Get strength sequence of the underlying graph
+        (or degree sequence in the unweighted case).
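+
+        A minimal sketch (for an unweighted graph this falls back to the
+        degree sequence, so for a triangle the values sum to ``6``):
+
+        .. doctest:: pathcensus
+
+            >>> A = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
+            >>> int(PathCensus(A).strength.sum())
+            6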
+        """
+        if self.weighted:
+            return self.graph.S
+        return self.graph.D
+
+    @property
+    def tdf(self) -> pd.DataFrame:
+        """Data frame with triple/triangle counts per edge."""
+        cols = [
+            name for name in self.definitions.get_column_names()
+            if name in self.definitions["sim"]
+        ]
+        return self.counts[cols]
+
+    @property
+    def qdf(self) -> pd.DataFrame:
+        """Data frame with quadruple/quadrangle counts per edge."""
+        cols = [
+            name for name in self.definitions.get_column_names()
+            if name in self.definitions["comp"]
+        ]
+        return self.counts[cols]
+
+    # Static & class methods --------------------------------------------------
+
+    @classmethod
+    def get_graph(
+        cls,
+        graph: types.GraphABC,
+        weighted: Optional[bool] = None,
+        validate: bool = True,
+        **kwds: Any
+    ) -> Graph:
+        """Get graph object for path counting.
+
+        Parameters
+        ----------
+        graph
+            A compatible graph object registered with
+            :py:class:`pathcensus.types.GraphABC`.
+        weighted
+            Should the graph be interpreted as a weighted graph.
+            If ``None`` it is determined automatically: the graph is
+            treated as weighted when any non-zero entry of the
+            adjacency matrix differs from ``1``.
+        validate
+            Should the input graph be validated for correctness
+            (i.e. checked whether it is undirected).
+        **kwds
+            Passed to :py:func:`pathcensus.utils.adjacency`.
+        """
+        A = adjacency(graph, **kwds)
+
+        if validate and (A != A.T).count_nonzero() > 0:
+            raise AttributeError("only undirected graphs are accepted")
+
+        n_nodes = A.shape[0]
+        E = np.ascontiguousarray(np.array(A.nonzero(), dtype=UInt).T)
+
+        if weighted is None:
+            weighted = any(A.data != 1)
+        if weighted:
+            W = A.data.astype(Float)
+        else:
+            W = None
+
+        G = Graph(n_nodes, E, W)
+        return G
+
+    @classmethod
+    def count_paths(
+        cls,
+        graph: Union[types.GraphABC, Graph],
+        *,
+        parallel: Optional[bool] = None,
+        num_threads: Optional[int] = None,
+        graph_kws: Optional[Dict] = None,
+        min_di: bool = True,
+        **kwds: Any
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """Count paths and cycles in a graph.
+
+        Parameters
+        ----------
+        graph
+            :py:class:`pathcensus.core.graph.Graph` instance
+            or graph-like object that can be converted to it.
+        parallel
+            Should the parallel counting algorithm be used.
+            When ``None`` it is enabled by default for graphs
+            with at least 100,000 edges.
+        num_threads
+            Number of threads to use when ``parallel=True``.
+        batch_size
+            Batch size to use when running with ``parallel=True``
+            (passed through ``**kwds``).
+        graph_kws
+            Additional keyword arguments passed to :py:meth:`get_graph`.
+            Used only when `graph` is not already in the JIT-compiled form.
+        min_di
+            Should the `di < dj` rule for iterating over edges be used.
+            This way the most expensive loop of the `PathCensus` algorithm
+            for computing edge-wise path/cycle counts always iterates over
+            the neighbors of the lower-degree node in an ``(i, j)`` edge.
+            It should almost always be set to ``True``;
+            the argument exists mostly for testing purposes.
+        **kwds
+            Passed to :py:func:`pathcensus.core.parallel.count_paths_parallel`
+            when ``parallel=True``.
+
+        Returns
+        -------
+        E
+            Edge list array the counts correspond to.
+        counts
+            Path and cycle counts.
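+
+        Examples
+        --------
+
+        A minimal sketch; the raw output arrays are implementation-specific,
+        so only their types are checked here:
+
+        .. doctest:: pathcensus
+
+            >>> A = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
+            >>> G = PathCensus.get_graph(A)
+            >>> E, counts = PathCensus.count_paths(G)
+            >>> isinstance(E, np.ndarray) and isinstance(counts, np.ndarray)
+            True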
+ """ + if isinstance(graph, types.GraphABC): + graph = cls.get_graph(graph, **(graph_kws or {})) + + if min_di: + E = graph.get_min_di_edges() + else: + E = graph.get_edges() + + if parallel is None: + # Use parallel algorithm when at least 100k edges + parallel = graph.n_edges >= 1e5 + if not num_threads or num_threads <= 0: + # pylint: disable=no-member + num_threads = numba.config.NUMBA_NUM_THREADS + if parallel and num_threads > 1: + orig_num_threads = numba.get_num_threads() + numba.set_num_threads(num_threads) + try: + E, counts = count_paths_parallel(graph, **kwds) + finally: + numba.set_num_threads(orig_num_threads) + else: + E, counts = graph.count_paths(E) + return E, counts + + # Auxiliary methods ------------------------------------------------------- + + def get_counts( + self, + mode: Literal[Meta.mode] = Meta.mode[0], # type: ignore + ) -> pd.DataFrame: + """Get (possibly aggregated) path counts. + + Parameters + ---------- + mode + Should node, edge or global counts be calculated. + """ + self._check_mode(mode) + counts = self.counts + + if mode == "nodes": + counts = counts.groupby(level="i") \ + .sum() \ + .reindex(np.arange(self.n_nodes), copy=False) \ + .fillna(0) + elif mode == "global": + counts = counts.sum().to_frame().T + + return counts + + # Similarity coefficients ------------------------------------------------- + + def tclust( + self, + *, + undefined: Literal[Meta.undef] = Meta.undef[0], # type: ignore + counts: Optional[pd.DataFrame] = None + ) -> pd.Series: + """Triangle-based local clustering (node-wise). + + It is equivalent to local clustering coefficient + :cite:p:`wattsCollectiveDynamicsSmallworld1998`. + + Parameters + ---------- + undefined + If ``'nan'`` the nodes with undefined values are treated + as NaNs. If ``'zero'`` then they are considered zeros. + counts + Path counts data frame to use. Mostly for internal use. + + Notes + ----- + It is defined as the ratio of triangles including a focal node ``i`` + to the number of wedge triples centered at it: + + .. math:: + + s^W_i = \\frac{2T_i}{t^W_i} + + .. figure:: /figures/t-wedge.svg + :align: center + :alt: Wedge triple + + Wedge triple. + + Examples + -------- + + .. doctest:: pathcensus + + >>> # Triangle graph + >>> A = np.array([[0,1,1], [1,0,1], [1,1,0]]) + >>> PathCensus(A).tclust() + i + 0 1.0 + 1 1.0 + 2 1.0 + dtype: float64 + """ + df = counts if counts is not None else self.get_counts("nodes") + num = df[self._a("twc")] + denom = df[self._a("tw")] + return self._divide(num, denom, undefined=undefined) + + def tclosure( + self, + *, + undefined: Literal[Meta.undef] = Meta.undef[0], # type: ignore + counts: Optional[pd.DataFrame] = None + ) -> pd.Series: + """Triangle-based local closure coefficient (node-wise). + + It is equivalent to local closure coefficient + :cite:p:`yinLocalClosureCoefficient2019`. + + Parameters + ---------- + undefined + If ``'nan'`` the nodes with undefined values are treated + as NaNs. If ``'zero'`` then they are considered zeros. + counts + Path counts data frame to use. Mostly for internal use. + + Notes + ----- + It is defined as the ratio of the number of triangles including + a focal node ``i`` to the number of head triples starting from it: + + .. math:: + + s^H_i = \\frac{2T_i}{t^H_i} + + .. figure:: /figures/t-head.svg + :align: center + :alt: Head triple + + Head triple. + + Examples + -------- + + .. 
doctest:: pathcensus
+
+            >>> # Triangle graph
+            >>> A = np.array([[0,1,1],[1,0,1],[1,1,0]])
+            >>> PathCensus(A).tclosure()
+            i
+            0    1.0
+            1    1.0
+            2    1.0
+            dtype: float64
+        """
+        df = counts if counts is not None else self.get_counts("nodes")
+        num = df[self._a("thc")]
+        denom = df[self._a("th")]
+        return self._divide(num, denom, undefined=undefined)
+
+    def similarity(
+        self,
+        mode: Literal[Meta.mode] = Meta.mode[0],    # type: ignore
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        counts: Optional[pd.DataFrame] = None
+    ) -> Union[pd.Series, float]:
+        """Structural similarity coefficients.
+
+        Parameters
+        ----------
+        mode
+            Should it be calculated for nodes, edges or globally
+            (equivalent to global clustering).
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Notes
+        -----
+        It is defined as the ratio of triangles including a focal node ``i``
+        to the total number of both wedge and head triples:
+
+        .. math::
+
+            s_i = \\frac{4T_i}{t^W_i + t^H_i}
+
+        See Also
+        --------
+        simcoefs : structural similarity coefficients
+        coefs : structural coefficients
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Triangle graph
+            >>> A = np.array([[0,1,1], [1,0,1], [1,1,0]])
+            >>> PathCensus(A).similarity("edges")
+            i  j
+            0  1    1.0
+               2    1.0
+            1  0    1.0
+               2    1.0
+            2  0    1.0
+               1    1.0
+            dtype: float64
+            >>> PathCensus(A).similarity("nodes")
+            i
+            0    1.0
+            1    1.0
+            2    1.0
+            dtype: float64
+            >>> PathCensus(A).similarity("global")
+            1.0
+        """
+        df = counts if counts is not None else self.get_counts(mode)
+        num = df[self._a("twc")] + df[self._a("thc")]
+        denom = df[self._a("tw")] + df[self._a("th")]
+        return self._divide(num, denom, undefined=undefined)
+
+    # Complementarity coefficients ---------------------------------------------
+
+    def qclust(
+        self,
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        counts: Optional[pd.DataFrame] = None
+    ) -> pd.Series:
+        """Quadrangle-based local clustering coefficient (node-wise).
+
+        Parameters
+        ----------
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Notes
+        -----
+        It is defined as the ratio of quadrangles including a focal node ``i``
+        to the number of wedge quadruples with ``i`` at the second position
+        (this is to avoid double counting and to make the numbers of wedge
+        and head quadruples per quadrangle equal):
+
+        .. math::
+
+            c^W_i = \\frac{2Q_i}{q^W_i}
+
+        .. figure:: /figures/q-wedge.svg
+            :align: center
+            :alt: Wedge quadruple
+
+            Wedge quadruple.
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Quadrangle graph
+            >>> A = np.array([[0,1,0,1],[1,0,1,0],[0,1,0,1],[1,0,1,0]])
+            >>> PathCensus(A).qclust()
+            i
+            0    1.0
+            1    1.0
+            2    1.0
+            3    1.0
+            dtype: float64
+        """
+        df = counts if counts is not None else self.get_counts("nodes")
+        num = self._qcount(df, which="wedge")
+        denom = df[self._a("qw")]
+        return self._divide(num, denom, undefined=undefined)
+
+    def qclosure(
+        self,
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        counts: Optional[pd.DataFrame] = None
+    ) -> pd.Series:
+        """Quadrangle-based local closure coefficient (node-wise).
+
+        Parameters
+        ----------
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Notes
+        -----
+        It is defined as the ratio of quadrangles including a focal node ``i``
+        to the number of head quadruples starting from it:
+
+        .. math::
+
+            c^H_i = \\frac{2Q_i}{q^H_i}
+
+        .. figure:: /figures/q-head.svg
+            :align: center
+            :alt: Head quadruple
+
+            Head quadruple.
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Quadrangle graph
+            >>> A = np.array([[0,1,0,1],[1,0,1,0],[0,1,0,1],[1,0,1,0]])
+            >>> PathCensus(A).qclosure()
+            i
+            0    1.0
+            1    1.0
+            2    1.0
+            3    1.0
+            dtype: float64
+        """
+        df = counts if counts is not None else self.get_counts("nodes")
+        num = self._qcount(df, which="head")
+        denom = df[self._a("qh")]
+        return self._divide(num, denom, undefined=undefined)
+
+    def complementarity(
+        self,
+        mode: Literal[Meta.mode] = Meta.mode[0],    # type: ignore
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        counts: Optional[pd.DataFrame] = None
+    ) -> Union[pd.Series, float]:
+        """Structural complementarity coefficients.
+
+        Parameters
+        ----------
+        mode
+            Should it be calculated for nodes, edges or globally
+            (equivalent to global clustering).
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Notes
+        -----
+        The node-wise coefficient is defined as the ratio of quadrangles
+        including a focal node ``i`` to the total number of both wedge
+        and head quadruples:
+
+        .. math::
+
+            c_i = \\frac{4Q_i}{q^W_i + q^H_i}
+
+        The edge-wise coefficient is defined as the ratio of quadrangles
+        including an ``(i, j)`` edge to the number of quadruples starting
+        at it:
+
+        .. math::
+
+            c_{ij} = \\frac{2Q_{ij}}{q_{ij}}
+
+        The global coefficient is defined as the ratio of the sum of
+        quadrangles to the sum of quadruples (wedge or head):
+
+        .. math::
+
+            c
+            = \\frac{2\\sum_i Q_i}{\\sum_i q^W_i}
+            = \\frac{2\\sum_i Q_i}{\\sum_i q^H_i}
+
+        See Also
+        --------
+        compcoefs : structural complementarity coefficients
+        coefs : structural coefficients
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Quadrangle graph
+            >>> A = np.array([[0,1,0,1],[1,0,1,0],[0,1,0,1],[1,0,1,0]])
+            >>> PathCensus(A).complementarity("edges")
+            i  j
+            0  1    1.0
+               3    1.0
+            1  0    1.0
+               2    1.0
+            2  1    1.0
+               3    1.0
+            3  0    1.0
+               2    1.0
+            dtype: float64
+            >>> PathCensus(A).complementarity("nodes")
+            i
+            0    1.0
+            1    1.0
+            2    1.0
+            3    1.0
+            dtype: float64
+            >>> PathCensus(A).complementarity("global")
+            1.0
+        """
+        df = counts if counts is not None else self.get_counts(mode)
+        num = self._qcount(df, which="wedge") \
+            + self._qcount(df, which="head")
+        denom = df[self._a("qw")] + df[self._a("qh")]
+        return self._divide(num, denom, undefined=undefined)
+
+    # Summaries ----------------------------------------------------------------
+
+    def simcoefs(
+        self,
+        mode: Literal[Meta.mode] = Meta.mode[0],    # type: ignore
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        census: bool = False,
+        counts: Optional[pd.DataFrame] = None
+    ) -> pd.DataFrame:
+        """Calculate similarity coefficients, including clustering
+        and closure coefficients when ``mode="nodes"``, or their
+        node-wise averages when ``mode="global"``.
+
+        Parameters
+        ----------
+        mode
+            Should node, edge or global counts be calculated.
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        census
+            If ``True`` then path census data is added
+            as columns at the front of the data frame.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Triangle graph
+            >>> A = np.array([[0,1,1], [1,0,1], [1,1,0]])
+            >>> PathCensus(A).simcoefs("edges")
+                 sim
+            i j
+            0 1  1.0
+              2  1.0
+            1 0  1.0
+              2  1.0
+            2 0  1.0
+              1  1.0
+            >>> PathCensus(A).simcoefs("nodes")
+               sim  tclust  tclosure
+            i
+            0  1.0     1.0       1.0
+            1  1.0     1.0       1.0
+            2  1.0     1.0       1.0
+            >>> PathCensus(A).simcoefs("global")
+               sim_g  sim  tclust  tclosure
+            0    1.0  1.0     1.0       1.0
+        """
+        counts = counts if counts is not None else self.get_counts(mode)
+        kwds = dict(undefined=undefined)
+
+        if mode == "edges":
+            coefs = pd.DataFrame({
+                "sim": self.similarity(mode, counts=counts, **kwds),
+            }, index=counts.index)
+        elif mode == "nodes":
+            coefs = pd.DataFrame({
+                "sim": self.similarity(mode, counts=counts, **kwds),
+                "tclust": self.tclust(counts=counts, **kwds),
+                "tclosure": self.tclosure(counts=counts, **kwds),
+            })
+        else:
+            coefs = self.simcoefs(mode="nodes", **kwds).mean().to_frame().T
+            coefs.insert(0, "sim_g", self.similarity(mode, counts=counts, **kwds))
+
+        if census:
+            paths = self.census(mode)
+            coefs = pd.concat([coefs, paths], axis=1)
+
+        return coefs
+
+    def compcoefs(
+        self,
+        mode: Literal[Meta.mode] = Meta.mode[0],    # type: ignore
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+        census: bool = False,
+        counts: Optional[pd.DataFrame] = None
+    ) -> pd.DataFrame:
+        """Calculate complementarity coefficients, including clustering
+        and closure coefficients when ``mode="nodes"``, or their
+        node-wise averages when ``mode="global"``.
+
+        Parameters
+        ----------
+        mode
+            Should node, edge or global counts be calculated.
+        undefined
+            If ``'nan'`` the nodes with undefined values are treated
+            as NaNs. If ``'zero'`` then they are considered zeros.
+        census
+            If ``True`` then path census data is added.
+        counts
+            Path counts data frame to use. Mostly for internal use.
+
+        Examples
+        --------
+
+        .. doctest:: pathcensus
+
+            >>> # Quadrangle graph
+            >>> A = np.array([[0,1,0,1],[1,0,1,0],[0,1,0,1],[1,0,1,0]])
+            >>> PathCensus(A).compcoefs("edges")
+                 comp
+            i j
+            0 1   1.0
+              3   1.0
+            1 0   1.0
+              2   1.0
+            2 1   1.0
+              3   1.0
+            3 0   1.0
+              2   1.0
+            >>> PathCensus(A).compcoefs("nodes")
+               comp  qclust  qclosure
+            i
+            0   1.0     1.0       1.0
+            1   1.0     1.0       1.0
+            2   1.0     1.0       1.0
+            3   1.0     1.0       1.0
+            >>> PathCensus(A).compcoefs("global")
+               comp_g  comp  qclust  qclosure
+            0     1.0   1.0     1.0       1.0
+        """
+        counts = counts if counts is not None else self.get_counts(mode)
+        kwds = dict(undefined=undefined)
+
+        if mode == "edges":
+            coefs = pd.DataFrame({
+                "comp": self.complementarity(mode, counts=counts, **kwds),
+            }, index=counts.index)
+        elif mode == "nodes":
+            coefs = pd.DataFrame({
+                "comp": self.complementarity(mode, counts=counts, **kwds),
+                "qclust": self.qclust(counts=counts, **kwds),
+                "qclosure": self.qclosure(counts=counts, **kwds)
+            })
+        else:
+            coefs = self.compcoefs(mode="nodes", **kwds).mean().to_frame().T
+            coefs.insert(0, "comp_g", self.complementarity(mode, counts=counts, **kwds))
+
+        if census:
+            paths = self.census(mode)
+            coefs = pd.concat([coefs, paths], axis=1)
+
+        return coefs
+
+    def coefs(
+        self,
+        mode: Literal[Meta.mode] = Meta.mode[0],    # type: ignore
+        **kwds
+    ) -> pd.DataFrame:
+        """Calculate structural coefficients.
+
+        Parameters
+        ----------
+        mode
+            Should node, edge or global counts be calculated.
+ undefined + If ``'nan'`` the nodes with undefined values are treated + as NaNs. If ``'zero'`` then they are considered zeros. + census + If ``True`` then path census data is added. + counts + Path counts data frame to use. Mostly for internal use. + + See Also + -------- + simcoefs : structural similarity coefficients + compcoefs: structural complementarity coefficients + """ + if "counts" not in kwds: + kwds["counts"] = self.get_counts(mode) + + census = kwds.pop("census", False) + + skw = kwds.copy() + ckw = kwds + + scoefs = self.simcoefs(mode, **skw) + ccoefs = self.compcoefs(mode, **ckw) + coefs = pd.concat([scoefs, ccoefs], axis=1) + + if census: + paths = self.census(mode) + coefs = pd.concat([coefs, paths], axis=1) + + return coefs + + def census( + self, + mode: Literal[Meta.mode] = Meta.mode[0], # type: ignore + *, + counts: Optional[pd.DataFrame] = None + ) -> pd.DataFrame: + """Calculate path census. + + Parameters + ---------- + mode + Should node, edge or global counts be calculated. + counts + Path counts data frame to use. Mostly for internal use. + + Examples + -------- + + .. doctest:: pathcensus + + >>> # Triangle graph + >>> A = np.array([[0,1,1], [1,0,1], [1,1,0]]) + >>> PathCensus(A).census() + t tw th q0 qw qh + i + 0 1 2 2 0 0 0 + 1 1 2 2 0 0 0 + 2 1 2 2 0 0 0 + """ + self._check_mode(mode) + if counts is None: + counts = self.get_counts(mode).copy() + + arules = self.definitions.aggregation.get(mode, {}) + for k, v in arules.items(): + if self.weighted: + counts[k] /= v + else: + counts[k] //= v + + return counts + + # Serialization ----------------------------------------------------------- + + def dump(self) -> Tuple[ + int, np.ndarray, Optional[np.ndarray], Optional[np.ndarray] + ]: + """Dump to raw data in the form of arrays and the number of nodes. + + Returns + ------- + n_nodes + Number of nodes. + E + Edgelist array. + W + Optional edge weights array. + counts + Path counts array. May be ``None``. + """ + return self.n_nodes, self.graph.E, self.graph.W, self.counts + + @classmethod + def from_dump( + cls, + n_nodes: int, + E: np.ndarray, + W: Optional[np.ndarray] = None, + counts: Optional[np.ndarray] = None, + adj_kws: Optional[Dict] = None, + **kwds: Any + ) -> PathCensus: # type: ignore + """Construct from the output of :py:meth:`dump`. + + Parameters + ---------- + n_nodes + Number of nodes. + E + Edgelist array. + W + Optional edge weights array. + counts + Optional path counts array. It is calculated on-the-fly + when ``None``. + adj_kws + Passed to :py:func:`pathcensus.utils.adjacency`. + **kwds + Passed to :py:meth:`count_paths` + when `counts` is ``None``. 
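+
+        Examples
+        --------
+
+        A minimal sketch of the data produced by :py:meth:`dump` (which
+        this constructor consumes); the arrays are implementation-specific,
+        so only the number of nodes is shown:
+
+        .. doctest:: pathcensus
+
+            >>> A = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
+            >>> n_nodes, E, W, counts = PathCensus(A).dump()
+            >>> n_nodes
+            3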
+        """
+        if W is None:
+            weighted = False
+            W = np.full(len(E), 1)
+        else:
+            weighted = True
+
+        # Skip the first column of ``E`` and unpack
+        # the remaining (i, j) node index columns
+        i, j = E[:, 1:].T
+        A = csr_matrix((W, (i, j)), shape=(n_nodes, n_nodes))
+        adj_kws = adj_kws or {}
+        paths = cls(A, weighted=weighted, adj_kws=adj_kws, count_paths=False)
+        if counts is None:
+            paths.count(**kwds)
+        else:
+            paths.counts = counts
+        return paths
+
+    # Internals ---------------------------------------------------------------
+
+    def _a(self, name: str) -> str:
+        """Resolve possibly aliased path name."""
+        return self.definitions.resolve(name)
+
+    def _make_counts(
+        self,
+        E: np.ndarray,
+        counts: np.ndarray
+    ) -> pd.DataFrame:
+        """Make path counts data frame."""
+        swaps = self.definitions.get_swap_rules()
+        cols = self.definitions.get_column_ids()
+        names = self.definitions.get_column_names()
+
+        # Add reversed (j, i) copies of the edges so counts
+        # are indexed symmetrically for both endpoints
+        E2 = E.copy()
+        E2[:, [0, 1]] = E[:, [1, 0]]
+        E = np.vstack((E, E2))
+
+        if not self.weighted:
+            counts = counts.astype(UInt)
+
+        counts2 = counts.copy()
+        for u, v in swaps:
+            counts2[:, [u, v]] = counts[:, [v, u]]
+        counts = np.vstack((counts, counts2))
+
+        # Drop unnecessary columns
+        counts = counts[:, cols]
+
+        counts = pd.DataFrame(
+            data=counts,
+            columns=names,
+            index=pd.MultiIndex.from_arrays(
+                arrays=E.T,
+                names=["i", "j"]
+            )
+        )
+        return counts.sort_index()
+
+    def _check_undef(self, val: str) -> None:
+        if val not in self.Meta.undef:
+            raise ValueError(f"'undefined' has to be one of {self.Meta.undef}")
+
+    def _check_mode(self, val: str) -> None:
+        if val not in self.Meta.mode:
+            raise ValueError(f"'mode' has to be one of {self.Meta.mode}")
+
+    def _divide(
+        self,
+        x: Union[int, float],
+        y: Union[int, float],
+        *,
+        undefined: Literal[Meta.undef] = Meta.undef[0],    # type: ignore
+    ) -> float:
+        with np.errstate(invalid="ignore"):
+            out = x / y
+        if undefined == "zero":
+            out[np.isnan(out) | np.isinf(out)] = 0.0
+        else:
+            out[np.isinf(out)] = np.nan
+        if out.size == 1:
+            out = out.iloc[0]
+        return out
+
+    def _qcount(
+        self,
+        df: pd.DataFrame,
+        which: Literal["wedge", "head"],
+    ) -> pd.Series:
+        """Get quadrangle count."""
+        if which == "wedge":
+            col = "q0wc"
+        elif which == "head":
+            col = "q0hc"
+        else:
+            raise ValueError("incorrect 'which' value")
+
+        q = df[self._a(col)].copy()
+        return q
+
+    def _rev_index(self, s: pd.Series) -> pd.Series:
+        s = s.copy()
+        s.index.names = s.index.names[::-1]
+        return s.swaplevel()
diff --git a/pathcensus/types.py b/pathcensus/types.py
new file mode 100644
index 0000000..059e5c4
--- /dev/null
+++ b/pathcensus/types.py
@@ -0,0 +1,11 @@
+"""Custom type definitions."""
+# pylint: disable=unused-import
+from typing import Union
+import numpy as np
+import pandas as pd
+from .graph import GraphABC    # type: ignore
+
+Data = Union[pd.Series, pd.DataFrame]
+
+UInt = np.uint
+Float = np.dtype(float).type
diff --git a/pathcensus/utils.py b/pathcensus/utils.py
new file mode 100644
index 0000000..33f1d00
--- /dev/null
+++ b/pathcensus/utils.py
@@ -0,0 +1,69 @@
+"""Utility functions."""
+from typing import Union, Optional
+import random as _random
+import numpy as np
+from scipy.sparse import isspmatrix, spmatrix
+from .core.random import set_numba_seed
+from . import adjacency    # pylint: disable=unused-import
+
+
+def set_seed(
+    all: Optional[int] = None,
+    *,
+    random: Optional[int] = None,
+    numpy: Optional[int] = None,
+    numba: Optional[int] = None,
+) -> None:
+    """Set seeds of random number generators.
+
+    Parameters
+    ----------
+    random
+        Seed value for the :py:mod:`random` generator.
+    numpy
+        Seed value for the :py:mod:`numpy` generator.
+    numba
+        Seed value for the :py:mod:`numba` generator.
+    all
+        Seed value used for all generators.
+        Cannot be used jointly with the other arguments.
+
+    Raises
+    ------
+    ValueError
+        If 'all' is used with other arguments or no seed is set.
+    """
+    # pylint: disable=redefined-builtin
+    any_seed = random is not None or numpy is not None or numba is not None
+    if all is not None and any_seed:
+        raise ValueError("'all' cannot be used with other arguments")
+    if all is None and not any_seed:
+        raise ValueError("no random generator module selected")
+
+    if all is not None:
+        random = numpy = numba = all
+
+    if random is not None:
+        _random.seed(random)
+    if numpy is not None:
+        np.random.seed(numpy)
+    if numba is not None:
+        set_numba_seed(numba)
+
+def rowsums(X: Union[np.ndarray, spmatrix]) -> np.ndarray:
+    """Calculate row sums of a matrix."""
+    if isspmatrix(X):
+        return np.array(X.sum(1)).flatten()
+    return X.sum(1)
+
+def relerr(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
+    """Relative error ``|(x1 - x2)| / |x2|``."""
+    return np.abs(x1 - x2) / np.abs(x2)
+
+def relclose(x1: np.ndarray, x2: np.ndarray, rtol: float = 1e-6) -> np.ndarray:
+    """Check whether two arrays are relatively close.
+
+    ``rtol`` defines the maximum allowed difference between ``x1``
+    and ``x2``, relative to the magnitude of ``x2``.
+    """
+    return (relerr(x1, x2) <= rtol).all()
diff --git a/pylintrc b/pylintrc
new file mode 100644
index 0000000..7673e43
--- /dev/null
+++ b/pylintrc
@@ -0,0 +1,581 @@
+[MASTER]
+
+# A comma-separated list of package or module names from where C extensions may
+# be loaded. Extensions are loading into the active Python interpreter and may
+# run arbitrary code.
+extension-pkg-whitelist=
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS,.git,.vscode,docs
+
+# Add files or directories matching the regex patterns to the blacklist. The
+# regex matches against base names, not paths.
+ignore-patterns=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
+# number of processors available to use.
+jobs=1
+
+# Control the amount of potential inferred values when inferring a single
+# object. This can help the performance when dealing with large functions or
+# complex, nested conditions.
+limit-inference-results=100
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# Specify a configuration file.
+#rcfile=
+
+# When enabled, pylint would attempt to guess common misconfiguration and emit
+# user-friendly hints instead of false-positive error messages.
+suggestion-mode=yes
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
+confidence=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once). 
You can also use "--disable=all" to +# disable everything first and then reenable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=E201, + E202, + E226, + E302, + E731, + C0111, + C0326, + R0201, + R0901, + cyclic-import, + duplicate-code, + invalid-name, + too-many-lines, + too-few-public-methods, + print-statement, + parameter-unpacking, + unpacking-in-except, + old-raise-syntax, + backtick, + long-suffix, + old-ne-operator, + old-octal-literal, + import-star-module-level, + non-ascii-bytes-literal, + raw-checker-failed, + bad-inline-option, + locally-disabled, + locally-enabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + apply-builtin, + basestring-builtin, + buffer-builtin, + cmp-builtin, + coerce-builtin, + execfile-builtin, + file-builtin, + long-builtin, + raw_input-builtin, + reduce-builtin, + standarderror-builtin, + unicode-builtin, + xrange-builtin, + coerce-method, + delslice-method, + getslice-method, + setslice-method, + no-absolute-import, + old-division, + dict-iter-method, + dict-view-method, + next-method-called, + metaclass-assignment, + indexing-exception, + raising-string, + reload-builtin, + oct-method, + hex-method, + nonzero-method, + cmp-method, + input-builtin, + round-builtin, + intern-builtin, + unichr-builtin, + map-builtin-not-iterating, + zip-builtin-not-iterating, + range-builtin-not-iterating, + filter-builtin-not-iterating, + using-cmp-argument, + eq-without-hash, + div-method, + idiv-method, + rdiv-method, + exception-message-attribute, + invalid-str-codec, + sys-max-int, + bad-python3-import, + deprecated-string-function, + deprecated-str-translate-call, + deprecated-itertools-function, + deprecated-types-field, + next-method-defined, + dict-items-not-iterating, + dict-keys-not-iterating, + dict-values-not-iterating, + deprecated-operator-function, + deprecated-urllib-function, + xreadlines-attribute, + deprecated-sys-function, + exception-escape, + comprehension-escape + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[REPORTS] + +# Python expression which should return a note less than 10 (10 is the highest +# note). You have access to the variables errors warning, statement which +# respectively contain the number of errors / warnings messages and the total +# number of statements analyzed. This is used by the global evaluation report +# (RP0004). +evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +#msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +output-format=text + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. 
+score=yes + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit + + +[LOGGING] + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io + + +[SIMILARITIES] + +# Ignore comments when computing similarities. +ignore-comments=yes + +# Ignore docstrings when computing similarities. +ignore-docstrings=yes + +# Ignore imports when computing similarities. +ignore-imports=no + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=120 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# List of optional constructs for which whitespace checking is disabled. `dict- +# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. +# `trailing-comma` allows a space between comma and closing bracket: (a, ). +# `empty-line` allows space-only lines. +no-space-check=trailing-comma, + dict-separator + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. 
+#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata, + dupa + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. +#class-attribute-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=UPPER_CASE + +# Regular expression matching correct constant names. Overrides const-naming- +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=x,y,z, + m,n, + xx,yy,zz, + i,j,k,l, + ri,rj,rk,rl, + u,v,p,r,s, + _, + wi,wj,wk,wl, + di,dj,dk,dl, + t,th,tw,q,qh,qw,q0,q1,q2, + D,S,E,G,X,Y,U,V,W, + ED,ES,EX,EY,EU,EV, + ex, + Run, + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. +#variable-rgx= + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. Available dictionaries: none. To make it working +# install python-enchant package.. +spelling-dict= + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to indicated private dictionary in +# --spelling-private-dict-file option instead of raising a message. +spelling-store-unknown-words=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. 
+contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether missing members accessed in mixin class should be ignored. A +# mixin class is detected if its name ends with "mixin" (case insensitive). +ignore-mixin-members=yes + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict, + _fields, + _replace, + _source, + _make + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=cls + + +[IMPORTS] + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules=optparse,tkinter.tix + +# Create a graph of external dependencies in the given file (report RP0402 must +# not be disabled). +ext-import-graph= + +# Create a graph of every (i.e. internal and external) dependencies in the +# given file (report RP0402 must not be disabled). +import-graph= + +# Create a graph of internal dependencies in the given file (report RP0402 must +# not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. 
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method.
+max-args=10
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=10
+
+# Maximum number of boolean expressions in an if statement.
+max-bool-expr=5
+
+# Maximum number of branch for function / method body.
+max-branches=12
+
+# Maximum number of locals for function / method body.
+max-locals=15
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+# Maximum number of return / yield for function / method body.
+max-returns=6
+
+# Maximum number of statements in function / method body.
+max-statements=50
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=1
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception".
+overgeneral-exceptions=Exception
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2168ad6
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,24 @@
+[build-system]
+requires = [
+    "setuptools>=52.0",
+    "wheel>=0.37"
+]
+build-backend = "setuptools.build_meta"
+
+
+# PyTest configuration
+[tool.pytest.ini_options]
+testpaths = [
+    "tests",
+    "pathcensus"
+]
+addopts = [
+    "-s",
+    "--pdbcls=IPython.terminal.debugger:Pdb",
+    "--doctest-glob=*.py",
+    "--doctest-modules"
+]
+doctest_optionflags = [ "NORMALIZE_WHITESPACE" ]
+python_files = [ "test_*.py" ]
+python_classes = [ "Test*" ]
+python_functions = [ "test_*" ]
diff --git a/requirements-docs.txt b/requirements-docs.txt
new file mode 100644
index 0000000..e58f60c
--- /dev/null
+++ b/requirements-docs.txt
@@ -0,0 +1,3 @@
+Sphinx>=4.3
+sphinx-rtd-theme>=1.0
+sphinxcontrib-bibtex>=2.4
diff --git a/requirements-tests.txt b/requirements-tests.txt
new file mode 100644
index 0000000..b1f3fd4
--- /dev/null
+++ b/requirements-tests.txt
@@ -0,0 +1,9 @@
+ipython>=7.26
+ipdb>=0.13
+pylint>=2.9
+pytest>=6.2
+pytest-runner>=5.3
+pytest-pylint>=0.18
+coverage>=5.5
+python-igraph>=0.9
+networkx>=2.6
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b45ac7c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+# Build
+setuptools>=52.0
+wheel>=0.37
+# Dependencies
+numpy>=1.20
+numba>=0.51
+scipy>=1.7
+pandas>=1.3
+NEMtropy>=2.0
+statsmodels>=0.12
+tqdm>=4.62
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..2da7611
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,73 @@
+[metadata]
+name = pathcensus
+version = 0.1
+author = Szymon Talaga
+author_email = stalaga@protonmail.com
+description =
+    Structural similarity and complementarity coefficients for undirected networks
+    based on efficient counting of 2- and 3-paths and 3- and 4-cycles
+long_description = file: README.rst
+long_description_content_type = text/x-rst
+url = https://github.com/sztal/pathcensus
+project_urls =
+    Source = https://github.com/sztal/pathcensus
+    Bug Tracker = https://github.com/sztal/pathcensus/issues
+classifiers =
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3.8
+    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
+    Natural Language :: English
+    License :: OSI Approved :: MIT License
+    Operating System :: OS Independent
+keywords =
+    networks
+    graphs
+    undirected
+    weighted
+    bipartite
+    network science
+    network geometry
+    triples
+    quadruples
+    triangles
+    quadrangles
+    path census
+    motifs
+    relational principles
+    homophily
+    similarity
+    complementarity
+    structural equivalence
+    random geometric graph
+    latent space model
+    exponential random graph
+    ergm
+
+[options]
+package_dir =
+    pathcensus = pathcensus
+packages = find:
+python_requires = >=3.8
+install_requires =
+    numpy>=1.20
+    numba>=0.50
+    scipy>=1.7
+    pandas>=1.3
+    statsmodels>=0.12
+    NEMtropy>=2.0
+    tqdm>=4.62
+tests_require =
+    ipython>=7.26
+    ipdb>=0.13
+    pylint>=2.9
+    pytest>=6.2
+    pytest-runner>=5.3
+    pytest-pylint>=0.18
+    coverage>=5.5
+    python-igraph>=0.9
+    networkx>=2.6
+test_suite = tests
+
+[aliases]
+test = pytest
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..dbe9716
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+import setuptools
+
+if __name__ == "__main__":
+    setuptools.setup()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..054e755
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,53 @@
+"""Shared configuration for unit tests."""
+import random
+from itertools import product
+import pytest
+from tests.utils import make_er_graph, make_rgg
+from tests.utils import make_triangle, make_quadrangle
+
+
+FAMILY = ("erdos_renyi", "geometric")
+VCOUNTS = (20, 100, 500)
+KBARS = (2, 10)
+RANDOM_SEEDS = (1, 10)
+MOTIFS = (
+    "triangle",
+    "quadrangle",
+)
+
+_params = list(product(FAMILY, VCOUNTS, KBARS, RANDOM_SEEDS))
+
+@pytest.fixture(scope="session", params=_params)
+def base_random_graph(request):
+    """Fixture for generating multiple Erdős–Rényi or geometric
+    random graphs with different numbers of nodes and average degrees
+    equal to ``2`` or ``10`` (as defined by ``KBARS``). They are
+    automatically passed to all test methods for testing consistency
+    between different methods of path counting.
+    """
+    family, n, dbar, seed = request.param
+    random.seed(seed)
+    if family == "geometric":
+        graph = make_rgg(n, dbar)
+    else:
+        graph = make_er_graph(n, dbar)
+    if dbar <= 5:
+        # Add an isolated node with the last id
+        # to test degree calculations in the 'Graph' class
+        graph.add_vertex()
+    return graph
+
+
+@pytest.fixture(scope="session", params=MOTIFS)
+def base_simple_motif(request):
+    """Fixture for generating small graphs with simple motifs
+    (e.g. triangles and quadrangles) for testing the correctness
+    of path/motif counting routines and relational coefficients.
+ """ + motif = request.param + if motif == "triangle": + G = make_triangle() + elif motif == "quadrangle": + G = make_quadrangle(weak=0) + else: + raise ValueError(f"unknown motif '{motif}'") + return motif, G diff --git a/tests/core/test_graph.py b/tests/core/test_graph.py new file mode 100644 index 0000000..3617bfc --- /dev/null +++ b/tests/core/test_graph.py @@ -0,0 +1,30 @@ +"""Test of the graph class.""" +# pylint: disable=redefined-outer-name +import numpy as np +import pytest +from pathcensus import PathCensus +from tests.utils import add_random_weights + + +@pytest.fixture(scope="session") +def weighted_random_graph(base_random_graph): + """Get weighted random graph.""" + return add_random_weights(base_random_graph) + + +class TestGraph: + """Unit tests for ``Graph`` class used for path/cycle counting.""" + + def test_degree(self, base_random_graph): + """Test degree sequence getter.""" + G = base_random_graph + D0 = np.array(G.degree()) + D1 = PathCensus.get_graph(G).degree() + assert np.allclose(D0, D1) + + def test_strength(self, weighted_random_graph): + """Test strength sequence getter.""" + G = weighted_random_graph + S0 = np.array(G.strength(weights="weight")) + S1 = PathCensus.get_graph(G).strength() + assert np.allclose(S0, S1) diff --git a/tests/core/test_parallel.py b/tests/core/test_parallel.py new file mode 100644 index 0000000..c519128 --- /dev/null +++ b/tests/core/test_parallel.py @@ -0,0 +1,33 @@ +"""Basic tests for parallel counting algorithms.""" +# pylint: disable=redefined-outer-name +import pytest +import numpy as np +from tests.utils import make_er_graph, add_random_weights +from pathcensus import PathCensus + + +@pytest.fixture(scope="session") +def unweighted_graph(): + """Get unweighted graph.""" + return make_er_graph(10000, dbar=20) + +@pytest.fixture(scope="session") +def weighted_graph(unweighted_graph): + """Get weighted graph.""" + return add_random_weights(unweighted_graph) + + +class TestParallelPathCounting: + """Basic tests of parallelized path counting algorithm(s). 
+    """
+    def test_parallel_unweighted(self, unweighted_graph):
+        graph = unweighted_graph
+        s1 = PathCensus(graph, parallel=True).counts.values
+        s2 = PathCensus(graph, parallel=False).counts.values
+        assert np.allclose(s1, s2)
+
+    def test_parallel_weighted(self, weighted_graph):
+        graph = weighted_graph
+        s1 = PathCensus(graph, parallel=True).counts.values
+        s2 = PathCensus(graph, parallel=False).counts.values
+        assert np.allclose(s1, s2)
diff --git a/tests/test_graph.py b/tests/test_graph.py
new file mode 100644
index 0000000..fa76481
--- /dev/null
+++ b/tests/test_graph.py
@@ -0,0 +1,28 @@
+"""Tests of graph format conversion."""
+# pylint: disable=redefined-outer-name
+import pytest
+from numpy.testing import assert_array_almost_equal
+from pathcensus import PathCensus
+from tests.utils import add_random_weights
+
+
+@pytest.fixture(scope="session")
+def weighted_random_graph(base_random_graph):
+    return add_random_weights(base_random_graph)
+
+
+class TestGraphConversion:
+    """Unit tests for graph converters."""
+    def test_networkx_unweighted(self, base_random_graph):
+        G = base_random_graph    # graph in `igraph` format
+        N = G.to_networkx()
+        P1 = PathCensus(G)
+        P2 = PathCensus(N)
+        assert_array_almost_equal(P1.counts.values, P2.counts.values)
+
+    def test_networkx_weighted(self, weighted_random_graph):
+        G = weighted_random_graph    # graph in `igraph` format
+        N = G.to_networkx()
+        P1 = PathCensus(G)
+        P2 = PathCensus(N)
+        assert_array_almost_equal(P1.counts.values, P2.counts.values)
diff --git a/tests/test_inference.py b/tests/test_inference.py
new file mode 100644
index 0000000..0c81380
--- /dev/null
+++ b/tests/test_inference.py
@@ -0,0 +1,117 @@
+"""Unit tests for :class:`pathcensus.inference.Inference`."""
+# pylint: disable=redefined-outer-name,too-many-locals
+import random
+from itertools import product
+import pytest
+import numpy as np
+from pathcensus import PathCensus
+from pathcensus.nullmodels import UBCM, UECM
+from pathcensus.inference import Inference
+from pathcensus.utils import set_seed
+from tests.utils import make_er_graph, make_rgg
+from tests.utils import get_largest_component, add_random_weights
+
+SEEDS = (20,)
+WEIGHTED = (False, True)
+PARAMS = list(product(SEEDS, WEIGHTED))
+
+def _make_graph(func, seed, weighted=False):
+    random.seed(seed)
+    set_seed(random=seed)
+    graph = get_largest_component(func(50, 7))
+    if weighted:
+        set_seed(numpy=seed)
+        graph = add_random_weights(graph)
+        model = UECM
+    else:
+        model = UBCM
+    model = model(graph)
+    model.fit()
+    model.validate()
+    return graph, model, seed
+
+@pytest.fixture(scope="session", params=PARAMS)
+def er_graph_inference(request):
+    """Inference object for a small ER graph."""
+    seed, weighted = request.param
+    return _make_graph(make_er_graph, seed, weighted=weighted)
+
+@pytest.fixture(scope="session", params=PARAMS)
+def rgg_graph_inference(request):
+    """Inference object for a small random geometric graph."""
+    seed, weighted = request.param
+    return _make_graph(make_rgg, seed, weighted=weighted)
+
+
+class TestInference:
+    """Unit tests for :class:`pathcensus.inference.Inference`.
+
+    In general, the tests check whether coefficients are insignificant
+    in ER graphs and whether similarity coefficients are significant
+    in RGGs.
+    """
+    @pytest.mark.parametrize("mode", ["nodes", "global"])
+    @pytest.mark.parametrize("alpha", [.01, .05])
+    def test_er_pvalues(self, er_graph_inference, mode, alpha):
+        """Test p-values in an ER graph.
+
+        In general it is expected that there will be no significant
+        values.
+ """ + graph, model, seed = er_graph_inference + + def stats(graph): + return PathCensus(graph).coefs(mode) + + inference = Inference(graph, model, stats) + set_seed(numba=seed) + + if mode == "nodes": + inference.aggregate_by = "units" + + n = 200 if graph.is_weighted() else 100 + data, null = inference.init_comparison(n) + + pvals = inference.estimate_pvalues(data, null, alpha=alpha) + if len(pvals) == 1: + threshold = 1 + else: + se = alpha*(1-alpha) / np.sqrt(pvals.shape[1]) + threshold = alpha + 1*se + assert (pvals.values <= alpha).mean() <= threshold + + @pytest.mark.parametrize("mode", ["nodes", "global"]) + @pytest.mark.parametrize("alpha", [.01, .05]) + def test_rgg_pvalues(self, rgg_graph_inference, mode, alpha): + graph, model, seed = rgg_graph_inference + + def stats(graph): + return PathCensus(graph).similarity(mode) + + inference = Inference(graph, model, stats) + set_seed(numba=seed) + + if mode == "nodes": + inference.aggregate_by = "units" + + n = 200 if graph.is_weighted() else 100 + data, null = inference.init_comparison(n) + + pvals = inference.estimate_pvalues(data, null, alpha=alpha) + assert (pvals <= alpha).mean() > alpha + + def test_simulate_null_with_seed(self, er_graph_inference): + """Test effect of passing random seed to :py:mod:`numba`.""" + graph, model, seed = er_graph_inference + + def stats(graph): + return PathCensus(graph).similarity("nodes", undefined="zero") + + inference = Inference(graph, model, stats) + + set_seed(numba=seed) + null1 = inference.simulate_null(100) + set_seed(numba=seed) + null2 = inference.simulate_null(100) + + assert (null1.index == null2.index).all() + assert np.allclose(null1, null2) diff --git a/tests/test_nullmodels.py b/tests/test_nullmodels.py new file mode 100644 index 0000000..b0b95ee --- /dev/null +++ b/tests/test_nullmodels.py @@ -0,0 +1,139 @@ +"""Unit tests for :mod:`pathcensus.nullmodels`.""" +# pylint: disable=redefined-outer-name +import random +from itertools import product +import pytest +import numpy as np +from pathcensus.nullmodels import UBCM, UECM +from pathcensus.utils import rowsums, set_numba_seed +from pathcensus.utils import relclose +from tests.utils import make_er_graph, make_rgg, add_random_weights +from tests.utils import get_largest_component + +FAMILY = ("erdos_renyi", "geometric") +SEEDS = (20, 40) + +_params = list(product(FAMILY, SEEDS)) +_methods = ("newton", "fixed-point") +_ubcm_params = list(product(["cm_exp", "cm"], _methods)) +_uecm_params = list(product(["ecm_exp", "ecm"], _methods)) + +@pytest.fixture(scope="session", params=_params) +def small_graph(request): + """Generate some small graphs (ER and RGG).""" + family, seed = request.param + random.seed(seed) + if family == "geometric": + graph = get_largest_component(make_rgg(50, 5)) + else: + graph = get_largest_component(make_er_graph(50, 5)) + return graph, seed + +@pytest.fixture(scope="session", params=_ubcm_params) +def small_graph_ubcm(request, small_graph): + """Generate some small graphs (ER and RGG).""" + model, method = request.param + graph, seed = small_graph + ubcm = UBCM(graph) + ubcm.fit(model, method) + return ubcm, seed, graph + +@pytest.fixture(scope="session", params=_uecm_params) +def small_graph_uecm(request, small_graph): + model, method = request.param + graph, seed = small_graph + np.random.seed(seed) + graph = add_random_weights(graph) + uecm = UECM(graph) + uecm.fit(model, method) + return uecm, seed, graph + + +class TestUBCM: + """Unit tests for Unweighted Binary Configuration Model.""" + def 
+    def test_ubcm(self, small_graph_ubcm):
+        """Test whether the expected degree sequence in UBCM approximates
+        the observed sequence.
+        """
+        ubcm, *_ = small_graph_ubcm
+        rtol = 1e-6 if ubcm.fit_args["method"] == "newton" else 1e-3
+
+        assert ubcm.is_fitted()
+        assert ubcm.is_valid(rtol)
+
+        P = ubcm.get_P(dense=True)
+        assert relclose(P.sum(axis=1), ubcm.D, rtol=rtol)
+
+    def test_ubcm_sampling(self, small_graph_ubcm):
+        """Test convergence of the average over degree sequences sampled
+        from UBCM towards the observed sequence.
+        """
+        ubcm, seed, _ = small_graph_ubcm
+        # The same tolerance works for both fitting methods here.
+        rtol = 1e-1
+
+        D = ubcm.D
+        E = np.zeros_like(D, dtype=float)
+        n = 1000
+
+        set_numba_seed(seed)
+
+        for rand in ubcm.sample(n):
+            E += rowsums(rand)
+
+        E = E / n
+        assert relclose(D, E, rtol=rtol)
+
+    def test_ubcm_seed(self, small_graph_ubcm):
+        """Test if setting random seed for sampling works correctly."""
+        ubcm, seed, _ = small_graph_ubcm
+
+        set_numba_seed(seed)
+        A1 = ubcm.sample_one()
+        set_numba_seed(seed)
+        A2 = ubcm.sample_one()
+        assert (A1 != A2).count_nonzero() == 0
+
+
+class TestUECM:
+    """Unit tests for the Undirected Enhanced Configuration Model (UECM)."""
+    def test_uecm(self, small_graph_uecm):
+        """Test whether the expected degree and strength sequences in UECM
+        approximate the observed sequences.
+        """
+        uecm, *_ = small_graph_uecm
+        rtol = 1e-1 if uecm.fit_args["method"] == "newton" else 2e-1
+
+        assert uecm.is_fitted()
+        assert uecm.is_valid(rtol)
+
+        P = uecm.get_P(dense=True)
+        W = uecm.get_W(dense=True)
+        assert relclose(P.sum(axis=1), uecm.D, rtol=rtol)
+        assert relclose(W.sum(axis=1), uecm.S, rtol=rtol)
+
+    def test_uecm_sampling(self, small_graph_uecm):
+        """Test convergence of the averages over degree and strength
+        sequences sampled from UECM towards the observed sequences.
+ """ + uecm, seed, _ = small_graph_uecm + rtol = 1e-1 if uecm.fit_args["method"] == "newton" else 2e-1 + + D = uecm.D + S = uecm.S + + ED = np.zeros_like(D, dtype=float) + ES = np.zeros_like(S, dtype=float) + + n = 1000 + set_numba_seed(seed) + + for rand in uecm.sample(n): + ES += rowsums(rand) + rand.data[:] = 1 + ED += rowsums(rand) + + ED /= n + ES /= n + + assert relclose(D, ED, rtol=rtol) + assert relclose(S, ES, rtol=rtol) diff --git a/tests/unweighted/__init__.py b/tests/unweighted/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unweighted/conftest.py b/tests/unweighted/conftest.py new file mode 100644 index 0000000..11513a1 --- /dev/null +++ b/tests/unweighted/conftest.py @@ -0,0 +1,17 @@ +"""Configuration for unweighted path counting tests.""" +import pytest +from pathcensus import PathCensus + + +@pytest.fixture(scope="session") +def random_graph(base_random_graph): + """Get graph with its corresponding path census.""" + graph = base_random_graph + return graph, PathCensus(graph) + + +@pytest.fixture(scope="session") +def simple_motif(base_simple_motif): + """Get motif name and its corresponding path census.""" + motif, G = base_simple_motif + return motif, PathCensus(G) diff --git a/tests/unweighted/test_pathcensus.py b/tests/unweighted/test_pathcensus.py new file mode 100644 index 0000000..3213080 --- /dev/null +++ b/tests/unweighted/test_pathcensus.py @@ -0,0 +1,358 @@ +"""Test unweighted path counting methods.""" +# pylint: disable=redefined-outer-name,too-few-public-methods +# pylint: disable=too-many-branches +from collections import defaultdict +import pytest +from pytest import approx +import numpy as np +from pathcensus.definitions import PathDefinitionsUnweighted +from pathcensus.utils import rowsums + + +@pytest.fixture(scope="session") +def paths_edges(random_graph): + """Fixture for generating path census data frames + for edges and node/global counts based `random_graph` fixture. + """ + _, P = random_graph + E = P.census("edges") + return E, P +@pytest.fixture(scope="session") +def paths_edges_nodes(paths_edges): + """Get edge and node path/cycle counts.""" + E, P = paths_edges + return E, P.census("nodes") +@pytest.fixture(scope="session") +def paths_edges_global(paths_edges): + """Get edge and global path/cycle counts.""" + E, P = paths_edges + return E, P.census("global") + +@pytest.fixture(scope="session") +def triangle_counts(random_graph): + """Get graph, path census and triangles enumerated with :mod:`igraph`.""" + G, P = random_graph + T = np.array(G.cliques(min=3, max=3)) + return G, T, P + + +class TestPathCounting: + """Tests of different path counting methods. + + All main path counting methods are defined for overall graph counts, + node counts and node-pair (edge) counts. The below tests check whether + the results of all different counting methods are consistent in a sense + that they give the same answers after proper summing. + """ + class TestAggregationConsistency: + """Tests of aggregation consistency between edge, node + and global counts. + """ + paths = PathDefinitionsUnweighted().get_column_names() + + @pytest.mark.parametrize("path", paths) + def test_edges_to_nodes(self, path, paths_edges_nodes): + """Check consistency between edge and node counts + of paths and cycles. 
+ """ + E, N = paths_edges_nodes + m0 = N[path].dropna() + m1 = E[path].groupby(level="i").sum() \ + .reindex(N.index) \ + .fillna(0) + + arules = PathDefinitionsUnweighted().aggregation.get("nodes", {}) + m1 /= arules.get(path, 1) + assert (m0 == m1).all() + + @pytest.mark.parametrize("path", paths) + def test_edges_to_global(self, path, paths_edges_global): + """Check consistency between edge and global counts + of paths and cycles. + """ + E, G = paths_edges_global + m0 = G[path].iloc[0] + m1 = E[path].sum() + + arules = PathDefinitionsUnweighted().aggregation.get("global", {}) + m1 /= arules.get(path, 1) + assert m0 == m1 + + + class TestSimpleMotifs: + """Test agreement with counts expected for simple motifs + such as triangle, quadrangle and star. + """ + simcoefs = ("sim_g", "sim", "tclust", "tclosure") + compcoefs = ("comp_g", "comp", "qclust", "qclosure") + + def approx_in(self, obj, vals, allow_nan=False, **kwds): + """Auxiliary method for approximate testing if + values in ``obj`` are in ``vals``. + """ + x = obj.values + l = np.zeros_like(x, dtype=bool) + for val in vals: + if allow_nan: + l |= np.isnan(x) | np.isclose(x, val, **kwds) + else: + l |= np.isclose(x, val, **kwds) + return l.all() + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_global(self, simple_motif, undefined): + """Check values of global structural coefficients + in simple motifs. + """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.simcoefs("global", **kwds) + comp = P.compcoefs("global", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_nodes(self, simple_motif, undefined): + """Check values of node-wise structural coefficients + in simple motifs. + """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.simcoefs("nodes", **kwds) + comp = P.compcoefs("nodes", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_edges(self, simple_motif, undefined): + """Check values of edge-wise structural coefficients + in simple motifs. + """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.similarity("edges", **kwds) + comp = P.complementarity("edges", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) + + class TestCountingAgainstOtherImplementations: + """Test path counting against triangle counting methods + implemented in :py:mod:`igraph` as well as naive implementations + using :py:mod:`numpy` arrays. + """ + def test_triangles_edges(self, triangle_counts): + """Test triangle counts for edges against + :py:mod:`igraph` implementation. 
+ """ + _, T, P = triangle_counts + t1 = P.tdf["t"].to_dict() + t0 = defaultdict(lambda: 0) + for i, j, k in T: + for key in [(i, j), (i, k), (j, k)]: + u, v = key + for link in [(u, v), (v, u)]: + t0[link] += 1 + for link in t1: + if link not in t0: + t0[link] = 0 + t0 = dict(t0) + assert t0 == t1 + + def test_triangles_nodes(self, triangle_counts): + """Test triangle counts for nodes against + :py:mod:`igraph` implementation. + """ + _, T, P = triangle_counts + t1 = (P.tdf["t"].groupby(level="i").sum() // 2).to_dict() + t0 = defaultdict(lambda: 0) + for triple in T: + for i in triple: + t0[i] += 1 + for i in t1: + if i not in t0: + t0[i] = 0 + t0 = dict(t0) + assert t0 == t1 + + def test_triangles_global(self, triangle_counts): + """Test global triangle counts + against :py:mod:`igraph` implementation. + """ + _, T, P = triangle_counts + t0 = len(T) + t1 = P.tdf["t"].sum() / 6 + assert t0 == t1 + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_clustering_local(self, random_graph, undefined): + """Test local clustering coefficient calculations + against the :py:mod:`igraph` implementation. + """ + G, M = random_graph + t0 = np.array(G.transitivity_local_undirected(mode=undefined)) + t1 = M.tclust(undefined=undefined).values + nan0 = np.isnan(t0) + nan1 = np.isnan(t1) + assert np.array_equal(nan0, nan1) + assert np.allclose(t0[~nan0], t1[~nan1]) + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_clustering_global(self, random_graph, undefined): + """Test global clustering coefficient calculations + against the :py:mod:`igraph` implementation. + """ + G, M = random_graph + t0 = G.transitivity_undirected(mode=undefined) + t1 = M.similarity("global", undefined=undefined) + assert t0 == approx(t1) + + def test_node_paths_wedge_triples(self, random_graph): + """Test against naive :py:mod:`numpy` implementation.""" + G, M = random_graph + k = np.array(G.degree()) + t0 = k*(k-1) + t1 = M.tdf["tw"].groupby(level="i").sum() \ + .reindex(np.arange(G.vcount())) \ + .fillna(0) + assert np.array_equal(t0, t1) + + def test_node_paths_head_triples(self, random_graph): + """Test against naive :py:mod:`numpy` implementation.""" + G, M = random_graph + A = G.get_adjacency_sparse() + k = np.array(G.degree()) + t0 = A@(k-1) + t1 = M.tdf["th"].groupby(level="i").sum() \ + .reindex(np.arange(G.vcount())) \ + .fillna(0) + assert np.array_equal(t0, t1) + + def test_node_paths_wedge_quadruples(self, random_graph): + """Test against naive :py:mod:`numpy` implementation.""" + G, M = random_graph + A = G.get_adjacency_sparse() + T = (A@A).multiply(A) + k = np.array(G.degree()) + q0 = (k-1)*(A@(k-1)) - rowsums(T) + q1 = M.qdf["qw"].groupby(level="i").sum() \ + .reindex(np.arange(G.vcount())) \ + .fillna(0) + assert np.array_equal(q0, q1) + + def test_node_paths_head_quadruples(self, random_graph): + """Test against naive :py:mod:`numpy` implementation.""" + G, M = random_graph + P = M.census("edges") + A = G.get_adjacency_sparse() + T = (A@A).multiply(A) + k = np.array(G.degree()) + k2 = A@(k-1) + q0 = A@k2 - k*(k-1) - rowsums(T) + q1 = P["qh"].groupby(level="i").sum() \ + .reindex(np.arange(G.vcount())) \ + .fillna(0) + assert np.array_equal(q0, q1) + + def test_tclust(self, random_graph): + """Test against naive :py:mod:`numpy` implementation.""" + G, M = random_graph + A = G.get_adjacency_sparse() + T = (A@A).multiply(A) + k = np.array(G.degree()) + tw = k*(k-1) // 2 + t = rowsums(T) // 2 + tclust0 = np.array(G.transitivity_local_undirected(mode="zero")) + tclust1 = t / 
+            tclust2 = M.tclust(undefined="zero").values
+            assert np.allclose(tclust0, tclust1) and np.allclose(tclust1, tclust2)
+
+        def test_tclosure(self, random_graph):
+            """Test against naive :py:mod:`numpy` implementation."""
+            G, M = random_graph
+            A = G.get_adjacency_sparse()
+            T = (A@A).multiply(A)
+            k = np.array(G.degree())
+            th = A@(k-1)
+            t = rowsums(T) // 2
+            tclo0 = 2*t / np.where(th == 0, 1, th)
+            tclo1 = M.tclosure(undefined="zero").values
+            assert np.allclose(tclo0, tclo1)
+
+        def test_similarity(self, random_graph):
+            """Test against naive :py:mod:`numpy` implementation."""
+            G, M = random_graph
+            A = G.get_adjacency_sparse()
+            T = (A@A).multiply(A)
+            k = np.array(G.degree())
+            tw = k*(k-1)
+            th = A@(k-1)
+            t = rowsums(T) // 2
+            tclust = 2*t / np.where(tw == 0, 1, tw)
+            tclo = 2*t / np.where(th == 0, 1, th)
+            with np.errstate(invalid="ignore", divide="ignore"):
+                sim0 = (tw*tclust + th*tclo) / (tw + th)
+            sim0[np.isnan(sim0)] = 0
+            sim1 = M.similarity("nodes", undefined="zero")
+            assert np.allclose(sim0, sim1)
+
+
+    class TestConsistencyBounds:
+        """Test consistency in terms of bounds between open
+        and closed paths. In particular, closed paths (e.g. triangles)
+        cannot be more frequent than their open counterparts.
+        Moreover, relational coefficients (similarity and complementarity)
+        must be bounded between the minimum and maximum of their
+        corresponding clustering and closure coefficients.
+        """
+        @pytest.mark.parametrize("mode", ["edges", "nodes", "global"])
+        def test_path_counts_consistency(self, random_graph, mode):
+            _, P = random_graph
+            P = P.census(mode)
+            assert (P.values >= 0).all()
+            assert (P["t"] <= P["tw"]).all()
+            assert (P["t"] <= P["th"]).all()
+            assert (P["q0"] <= P["qw"]).all()
+            assert (P["q0"] <= P["qh"]).all()
+
+        @pytest.mark.parametrize("mode", ["edges", "nodes", "global"])
+        def test_similarity_coefs_consistency(self, random_graph, mode):
+            _, P = random_graph
+            C = P.coefs(mode).dropna()
+            vals = C.values
+            assert (vals >= -1e-6).all() and (vals <= 1+1e-6).all()
+            if mode == "nodes":
+                m0 = C[["tclust", "tclosure"]].min(axis=1)
+                m1 = C[["tclust", "tclosure"]].max(axis=1)
+                assert (C["sim"].between(m0, m1)).all()
+
+        @pytest.mark.parametrize("mode", ["edges", "nodes", "global"])
+        def test_complementarity_coefs_consistency(self, random_graph, mode):
+            _, P = random_graph
+            C = P.coefs(mode).dropna()
+            vals = C.values
+            assert (vals >= -1e-6).all() and (vals <= 1+1e-6).all()
+            if mode == "nodes":
+                m0 = C[["qclust", "qclosure"]].min(axis=1)
+                m1 = C[["qclust", "qclosure"]].max(axis=1)
+                assert (C["comp"].between(m0, m1)).all()
diff --git a/tests/unweighted/test_theory.py b/tests/unweighted/test_theory.py
new file mode 100644
index 0000000..04ed2a8
--- /dev/null
+++ b/tests/unweighted/test_theory.py
@@ -0,0 +1,76 @@
+"""Tests of theoretical results."""
+# pylint: disable=redefined-outer-name,cyclic-import
+import pytest
+import numpy as np
+from pathcensus import PathCensus
+from tests.utils import get_largest_component
+
+
+@pytest.fixture(scope="session")
+def random_graph_connected(random_graph):
+    """Largest connected component of the random graph with its path census."""
+    G, _ = random_graph
+    G = get_largest_component(G)
+    P = PathCensus(G)
+    return G, P
+
+
+class TestTheory:
+    """Test various theoretical results concerning
+    local similarity and complementarity coefficients.
+ """ + @staticmethod + def weighted_average(x, w): + m = np.isnan(x) + if m.all(): + return 0 + x = x[~m] + w = w[~m] + return (x * w).sum() / w.sum() + + def test_similarity_node_edge_sum(self, random_graph_connected): + """Test whether node similarity is a weighted average + of corresponding edge similarities. + """ + _, P = random_graph_connected + edge = P.simcoefs("edges", census=True, undefined="nan") \ + .groupby(level="i") \ + .apply(lambda df: \ + self.weighted_average(df["sim"], df["tw"] + df["th"]) + ) + node = P.similarity("nodes", undefined="zero") + assert np.allclose(edge, node) + + def test_similarity_node_edge_minmax_bounds(self, random_graph_connected): + """Test whether node similarity is bounded between + minimum and maximum edge similarity. + """ + _, P = random_graph_connected + gdf = P.similarity("edges").groupby(level="i").agg([min, max]) + s_node = P.similarity("nodes", undefined="undefined") + s_emin = gdf["min"] + s_emax = gdf["max"] + assert s_node.between(s_emin, s_emax).all() + + def test_complementarity_node_edge_sum(self, random_graph_connected): + """Test whether node complementarity is a weighted average + of corresponding edge complementairyt coefficients. + """ + _, P = random_graph_connected + edge = P.compcoefs("edges", census=True, undefined="nan") \ + .groupby(level="i") \ + .apply(lambda df: \ + self.weighted_average(df["comp"], df["qw"] + df["qh"]) + ) + node = P.complementarity("nodes", undefined="zero") + assert np.allclose(edge, node) + + def test_complementarity_node_edge_minmax_bounds(self, random_graph_connected): + """Test whether node complementarity is bounded between + minimum and maximum edge complementarity. + """ + _, P = random_graph_connected + gdf = P.complementarity("edges").groupby(level="i").agg([min, max]) + c_node = P.complementarity("nodes", undefined="zero") + c_emin = gdf["min"] + c_emax = gdf["max"] + assert c_node.between(c_emin, c_emax).all() diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..c2bbece --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,65 @@ +"""Test utilities.""" +from typing import Union, Any +import numpy as np +from scipy.sparse import spmatrix, isspmatrix +import igraph as ig + + +def rowsums(X: Union[np.ndarray, spmatrix]) -> np.ndarray: + """Calculate row sums of a matrix.""" + if isspmatrix(X): + return np.array(X.sum(1)).flatten() + return X.sum(1) + +def get_largest_component(graph: ig.Graph, **kwds: Any) -> ig.Graph: + """Get largest component of a graph. + + ``**kwds`` are passed to :py:meth:`igraph.Graph.components`. + """ + vids = None + for component in graph.components(**kwds): + if vids is None or len(component) > len(vids): + vids = component + return graph.induced_subgraph(vids) + +# Random graphs --------------------------------------------------------------- + +def make_er_graph(n, dbar): + """Make ER random graph with given average degree.""" + p = dbar / (n-1) + return ig.Graph.Erdos_Renyi(n, p=p, directed=False) + +def make_rgg(n, dbar): + """Make random geometric graph with given average degree.""" + radius = np.sqrt(dbar/(np.pi*(n-1))) + return ig.Graph.GRG(n, radius=radius, torus=True) + +def add_random_weights(graph, m0=1, m1=10): + """Add random integer weights between ``m0`` and ``m1`` + to a :py:class:`igraph.Graph` instance. 
+ """ + graph = graph.copy() + graph.es["weight"] = np.random.randint(m0, m1, (graph.ecount(),)) + return graph + +# Motifs ---------------------------------------------------------------------- + +def make_triangle(): + """Make a simple triangle graph (undirected).""" + G = ig.Graph(directed=False) + G.add_vertices(3) + G.add_edges([(0, 1), (1, 2), (2, 0)]) + return G + +def make_quadrangle(weak=0): + """Make a simple quadrangle graph (undirected) + with the number of chords equal to ``weak``. + """ + G = ig.Graph(directed=False) + G.add_vertices(4) + G.add_edges([(0, 1), (1, 2), (2, 3), (3, 0)]) + if weak >= 1: + G.add_edge(0, 2) + if weak == 2: + G.add_edge(1, 3) + return G diff --git a/tests/weighted/__init__.py b/tests/weighted/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/weighted/conftest.py b/tests/weighted/conftest.py new file mode 100644 index 0000000..5490327 --- /dev/null +++ b/tests/weighted/conftest.py @@ -0,0 +1,27 @@ +"""Configuration for weighted path counting tests.""" +# pylint: disable=cyclic-import +import pytest +import numpy as np +from pathcensus import PathCensus +from tests.utils import add_random_weights + +_SEEDS = (324, 7171) + +@pytest.fixture(scope="session", params=_SEEDS) +def random_graph(base_random_graph, request): + """Get graph with its corresponding path census.""" + seed = request.param + graph = base_random_graph + np.random.seed(seed) + graph = add_random_weights(graph) + return graph, PathCensus(graph) + + +@pytest.fixture(scope="session", params=_SEEDS) +def simple_motif(base_simple_motif, request): + """Get motif name and its corresponding path census.""" + seed = request.param + motif, G = base_simple_motif + np.random.seed(seed) + G = add_random_weights(G) + return motif, PathCensus(G) diff --git a/tests/weighted/test_pathcensus.py b/tests/weighted/test_pathcensus.py new file mode 100644 index 0000000..0d8d86d --- /dev/null +++ b/tests/weighted/test_pathcensus.py @@ -0,0 +1,276 @@ +"""Test weighted path counting methods.""" +# pylint: disable=redefined-outer-name,too-few-public-methods +# pylint: disable=too-many-branches +import pytest +from pytest import approx +import numpy as np +import pandas as pd +from pathcensus.definitions import PathDefinitionsWeighted +from pathcensus import PathCensus + + +@pytest.fixture(scope="session") +def paths_edges(random_graph): + """Fixture for generating path census data frames + for edges and node/global counts based `random_graph` fixture. + """ + _, S = random_graph + E = S.census("edges") + return E, S +@pytest.fixture(scope="session") +def paths_edges_nodes(paths_edges): + """Get edge and node path/cycle counts.""" + E, S = paths_edges + return E, S.census("nodes") +@pytest.fixture(scope="session") +def paths_edges_global(paths_edges): + """Get edge and global path/cycle counts.""" + E, S = paths_edges + return E, S.census("global") + +@pytest.fixture(scope="session") +def graph_weights_one(random_graph): + """Pair of :py:class:`pathcensus.PathCensus` objects for weighted and + unweighted version of the same graph with all weights equal to ``1``. + """ + G, _ = random_graph + G.es["weight"] = np.ones((G.ecount(),)) + P0 = PathCensus(G, weighted=False) + P1 = PathCensus(G, weighted=True) + return P0, P1 + +@pytest.fixture(scope="session") +def graph_weights_uniform(random_graph): + """Pair of :py:class:`pathcensus.PathCensus` objects for weighted and + unweighted version of the same graph with all weights being uniform + but other than ``1``. 
+ """ + G, _ = random_graph + G.es["weight"] = 3*np.ones((G.ecount(),)) + P0 = PathCensus(G, weighted=False) + P1 = PathCensus(G, weighted=True) + return P0, P1 + + +class TestPathCounting: + """Tests of different path counting methods. + + All main path counting methods are defined for overall graph counts, + node counts and node-pair (edge) counts. The below tests check whether + the results of all different counting methods are consistent in a sense + that they give the same answers after proper summing. + """ + class TestAggregationConsistency: + """Tests of aggregation consistency between edge, node + and global counts. + """ + paths = PathDefinitionsWeighted().get_column_names() + + @pytest.mark.parametrize("path", paths) + def test_edges_to_nodes(self, path, paths_edges_nodes): + """Check consistency between edge and node counts + of paths and cycles. + """ + E, N = paths_edges_nodes + m0 = N[path].dropna() + m1 = E[path].groupby(level="i").sum() \ + .reindex(N.index) \ + .fillna(0) + + arules = PathDefinitionsWeighted().aggregation.get("nodes", {}) + m1 /= arules.get(path, 1) + assert np.allclose(m0, m1) + + @pytest.mark.parametrize("path", paths) + def test_edges_to_global(self, path, paths_edges_global): + """Check consistency between edge and global counts + of paths and cycles. + """ + E, G = paths_edges_global + m0 = G[path].iloc[0] + m1 = E[path].sum() + + arules = PathDefinitionsWeighted().aggregation.get("global", {}) + m1 /= arules.get(path, 1) + assert m0 == approx(m1) + + + class TestCountingAgainstOtherImplementations: + """Test weighted path counting against mean weighted local + clustering coefficient as defined by Barrat et al. + and implemented in :py:mod:`igraph`. + + In general, weighted `t`-clustering should be equal to + the method by Barrat et al. + """ + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_mean_local_clustering(self, random_graph, undefined): + G, P = random_graph + c0 = G.transitivity_avglocal_undirected(weights="weight", mode=undefined) + c1 = P.tclust(undefined=undefined).mean(skipna=False) + assert np.isnan([c0, c1]).all() or c0 == approx(c1) + + + class TestConsistencyBounds: + """Test consistency in terms of bounds between open + and closed paths. In particular, closed paths (e.g. triangles) + cannot be more frequent than their open counterparts. + Moreover, relational coefficients (similarity and complementarity) + must be bounded between their min/max of their corresponding + clustering and closure coefficients. 
+ """ + @pytest.mark.parametrize("mode", ["edges", "nodes", "global"]) + def test_path_counts_consistency(self, random_graph, mode): + _, P = random_graph + C = P.census(mode) + tol = 1e-6 + assert (C.values >= 0).all() + assert (C["twc"] <= C["tw"] + tol).all() + assert (C["thc"] <= C["th"] + tol).all() + assert (C["q0wc"] <= C["qw"] + tol).all() + assert (C["q0hc"] <= C["qh"] + tol).all() + + @pytest.mark.parametrize("mode", ["edges", "nodes", "global"]) + def test_similarity_coefs_consistency(self, random_graph, mode): + _, P = random_graph + C = P.coefs(mode).dropna() + vals = C.values + assert (vals >= -1e-6).all() and (vals <= 1+1e-6).all() + if mode == "nodes": + m0 = C[["tclust", "tclosure"]].min(axis=1) + m1 = C[["tclust", "tclosure"]].max(axis=1) + assert (C["sim"].between(m0, m1)).all() + + @pytest.mark.parametrize("mode", ["edges", "nodes", "global"]) + def test_complementarity_coefs_consistency(self, random_graph, mode): + _, P = random_graph + C = P.coefs(mode).dropna() + vals = C.values + assert (vals >= -1e-6).all() and (vals <= 1+1e-6).all() + if mode == "nodes": + m0 = C[["qclust", "qclosure"]].min(axis=1) + m1 = C[["qclust", "qclosure"]].max(axis=1) + assert (C["comp"].between(m0, m1)).all() + + + class TestConsistencyWithUnweightedMethods: + """Test whether weighted counts with uniform weights + are consistent with the unweighted counts etc. + """ + @staticmethod + def to_unweighted(df): + """Combine weighted counts so they have the same columns + as unweighted counts. + """ + return pd.DataFrame({ + "t": (df["twc"] + df["thc"]) / 2, + "tw": df["tw"], + "th": df["th"], + "q0": (df["q0wc"] + df["q0hc"]) / 2, + "qw": df["qw"], + "qh": df["qh"] + }) + + @pytest.mark.parametrize("mode", ["edges", "nodes", "global"]) + def test_path_counts_consistency(self, graph_weights_one, mode): + """Test consistency of path counts.""" + P0, P1 = graph_weights_one + assert P1.weighted + p0 = P0.census(mode) + p1 = self.to_unweighted(P1.census(mode)) + assert np.allclose(p0.values, p1.values) + + @pytest.mark.parametrize("mode", ["edges", "nodes", "global"]) + def test_coefs_consistency(self, graph_weights_uniform, mode): + """Test consistency of coefficients.""" + P0, P1 = graph_weights_uniform + assert P1.weighted + c0 = P0.coefs(mode, undefined="zero") + c1 = P1.coefs(mode, undefined="zero") + assert np.allclose(c0.values, c1.values) + + + class TestSimpleMotifs: + """Test agreement with counts expected for simple motifs + such as triangle, quadrangle and star. + """ + simcoefs = ("sim_g", "sim", "tclust", "tclosure") + compcoefs = ("comp_g", "comp", "qclust", "qclosure") + + def approx_in(self, obj, vals, allow_nan=False, **kwds): + """Auxiliary method for approximate testing if + values in ``objs`` are in ``vals``. + """ + x = obj.values + l = np.zeros_like(x, dtype=bool) + for val in vals: + if allow_nan: + l |= np.isnan(x) | np.isclose(x, val, **kwds) + else: + l |= np.isclose(x, val, **kwds) + return l.all() + + def approx_between(self, obj, lo, hi, allow_nan=False, tol=1e-6): + """Auxiliary method for approximate testing if + valuesin ``obj`` are between ``lo`` and ``hi``. + """ + x = obj.values + l = np.isnan(x) if allow_nan else np.zeros_like(x, dtype=bool) + return (l | (x >= lo-tol) | (x <= hi+tol)).all() + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_global(self, simple_motif, undefined): + """Check values of global structural coefficients + in simple motifs. 
+ """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.simcoefs("global", **kwds) + comp = P.compcoefs("global", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_nodes(self, simple_motif, undefined): + """Check values of node-wise structural coefficients + in simple motifs. + """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.simcoefs("nodes", **kwds) + comp = P.compcoefs("nodes", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) + + @pytest.mark.parametrize("undefined", ["nan", "zero"]) + def test_simple_motifs_edges(self, simple_motif, undefined): + """Check values of edge-wise structural coefficients + in simple motifs. + """ + motif, P = simple_motif + kwds = dict(undefined=undefined) + + sim = P.similarity("edges", **kwds) + comp = P.complementarity("edges", **kwds) + + if motif == "triangle": + assert self.approx_in(sim, [1]) + assert self.approx_in(comp, [0], allow_nan=True) + + elif motif == "quadrangle": + assert self.approx_in(sim, [0]) + assert self.approx_in(comp, [1]) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..0ba3800 --- /dev/null +++ b/tox.ini @@ -0,0 +1,35 @@ +[tox] +envlist = py38, py39, py310, style, coverage +requires = tox-conda + +[testenv] +setenv = + PYTHONPATH = {toxinidir}:{toxinidir} +deps = + -r{toxinidir}/requirements.txt + -r{toxinidir}/requirements-tests.txt +commands = + pytest --basetemp={envtmpdir} + +[testenv:coverage] +deps = + -r{toxinidir}/requirements.txt + -r{toxinidir}/requirements-tests.txt +commands = + coverage run --source pathcensus -m pytest + coverage report --omit=pathcensus/core/*.py + +[testenv:style] +deps = + -r{toxinidir}/requirements.txt + -r{toxinidir}/requirements-tests.txt +commands = + pytest --pylint -m pylint + +# [testenv:docs] +# changedir=docs/ +# deps = +# -r{toxinidir}/requirements-docs.txt +# commands = +# sphinx-build -b linkcheck ./ _build/ +# sphinx-build -b html ./ _build/