diff --git a/.github/DOCS.md b/.github/DOCS.md new file mode 100644 index 0000000..c740c32 --- /dev/null +++ b/.github/DOCS.md @@ -0,0 +1,32 @@ +# GitHub config and workflows + +In this folder there is configuration for code coverage, dependabot and CI workflows. + +This folder can be merged using a `--allow-unrelated-histories` merge strategy from [spraakbanken/python-pdm-ci-conf](https://github.com/spraakbanken/python-pdm-ci-conf), which provides a reasonably sensible base for writing your own CI on. By using this strategy the history of the CI repo is included in your repo, and future updates to the CI can be merged later. + +The workflows in this folder require a root Makefile with a couple of targets defined. +The Makefile in [spraakbanken/python-pdm-make-conf](https://github.com/spraakbanken/python-pdm-make-conf) can be used as a base. + +## Publish + +The `publish` step in [release.yml](./workflows/release.yml) is configured to use the GitHub environment `release`; create that environment or change it to your preferred one. +To publish to PyPI you must also configure your PyPI project settings to use Trusted Publisher Management, by setting repo, workflow and environment on PyPI. + +To perform this merge run: + +```shell +git remote add ci git@github.com:spraakbanken/python-pdm-ci-conf.git +git fetch ci +git merge --allow-unrelated-histories ci/main +``` + +or add the remote as `git remote add ci https://github.com/spraakbanken/python-pdm-ci-conf.git` + +To later merge updates to this repo, just run: + +```shell +git fetch ci +git merge ci/main +``` + +This setup is inspired by [jonhoo/rust-ci-conf](https://github.com/jonhoo/rust-ci-conf). diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..cd5ce8f --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,21 @@ +# ref: https://docs.codecov.com/docs/codecovyml-reference +coverage: + # Hold ourselves to a high bar + range: 85..100 + round: down + precision: 1 + status: + # ref: https://docs.codecov.com/docs/commit-status + project: + default: + # Avoid false negatives + threshold: 1% + +# Test files aren't important for coverage +ignore: + - "tests" + +# Make comments less noisy +comment: + layout: "files" + require_changes: true diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..aa363d3 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,25 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + # Maintain dependencies for GitHub Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "daily" + ignore: + - dependency-name: "*" + # patch and minor updates don't matter for libraries, as consumers of this library build + # with their own lockfile rather than the version specified in this library's lockfile. + # Remove this ignore rule if your package has binaries, to ensure that the binaries are + # built with the exact set of dependencies and that those are up to date.
+ update-types: + - "version-update:semver-patch" + - "version-update:semver-minor" diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml new file mode 100644 index 0000000..e9cb537 --- /dev/null +++ b/.github/workflows/check.yml @@ -0,0 +1,111 @@ +name: check + +on: + push: + branches: + - main + pull_request: + merge_group: + +permissions: + contents: read + +env: + MINIMUM_PYTHON_VERSION: "3.8" + +# If new code is pushed to a PR branch, then cancel in-progress workflows for that PR. This ensures +# that we don't waste CI time and get results quicker, see https://github.com/jonhoo/rust-ci-conf/pull/5 +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + fmt: + runs-on: ubuntu-latest + name: ubuntu / 3.8 / fmt + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Set up Python ${{ env.MINIMUM_PYTHON_VERSION }} + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + + - name: Load cached venv + id: cached-venv + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/pdm.lock') }}-${{ hashFiles('.github/workflows/check.yml') }} + + - name: Install dependencies + if: steps.cached-venv.outputs.cache-hit != 'true' + run: make install-dev + + - name: check formatting + run: make check-fmt + lint: + runs-on: ubuntu-latest + name: ubuntu / 3.8 / lint + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Set up Python ${{ env.MINIMUM_PYTHON_VERSION }} + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + - name: Load cached venv + id: cached-venv + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/pdm.lock') }}-${{ hashFiles('.github/workflows/check.yml') }} + - name: Install dependencies + if: steps.cached-venv.outputs.cache-hit != 'true' + run: make install-dev + - name: lint code + run: make lint + type-check: + runs-on: ubuntu-latest + name: ubuntu / 3.8 / type-check + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Set up Python ${{ env.MINIMUM_PYTHON_VERSION }} + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + - name: Load cached venv + id: cached-venv + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/pdm.lock') }}-${{ hashFiles('.github/workflows/check.yml') }} + - name: Install dependencies + if: steps.cached-venv.outputs.cache-hit != 'true' + run: make install-dev + - name: type-check code + run: make type-check + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + check-check: + if: always() + needs: + - fmt + - lint + - type-check + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} + # allowed-failures: upload-coverage diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..e37fb25 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0
+1,138 @@ +name: release + +on: + push: + branches: + - main + tags: + - 'v[0-9]+.[0-9]+.[0-9]+' + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true +permissions: + contents: read + +env: + MINIMUM_PYTHON_VERSION: "3.8" + +jobs: + build: + # This action builds distribution files for upload to PyPI + + name: ubuntu / 3.8 / build + runs-on: ubuntu-latest + steps: + #---------------------------------------------- + # check-out repo and set-up python + #---------------------------------------------- + - name: Check out repository + uses: actions/checkout@v4 + with: + submodules: true + + #---------------------------------------------- + # ----- setup python ----- + #---------------------------------------------- + - name: Set up the environment + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + + #---------------------------------------------- + # ----- build distribution ----- + #---------------------------------------------- + - name: Build distribution + run: make build + + #---------------------------------------------- + # ----- upload artifacts ----- + #---------------------------------------------- + - uses: actions/upload-artifact@v4 + with: + name: pypi_files + path: dist + + test-build: + # This action runs the test suite on the artifact built by the `build` job. + # The default is to run this on ubuntu, macos and windows + + name: ${{ matrix.os }} / 3.8 / test built artifact + needs: [build] + + strategy: + fail-fast: false + matrix: + os: + - ubuntu + - macos + - windows + + runs-on: ${{ matrix.os }}-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: set up python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + + - name: get dist artifacts + uses: actions/download-artifact@v4 + with: + name: pypi_files + path: dist + + - run: rm -r src/parallel_corpus + - run: pip install typing-extensions + - run: pip install -r tests/requirements-testing.lock + - run: pip install parallel-corpus --no-index --no-deps --find-links dist --force-reinstall + - run: pytest + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + release-check: + if: always() + needs: + - build + - test-build + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} + # allowed-failures: coverage + + publish: + # This action publishes the built and tested artifact to PyPI, but only on a tag + + needs: + - test-build + if: success() && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + environment: release + permissions: + # IMPORTANT: this permission is mandatory for trusted publishing + id-token: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Set up Python ${{ env.MINIMUM_PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + - name: get dist artifacts + uses: actions/download-artifact@v4 + with: + name: pypi_files + path: dist + + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 0000000..48b4288 --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,74 @@ +# Run scheduled
(rolling) jobs on a nightly basis, as your package may break independently of any +# given PR. E.g., updates to the 3.13-dev pre-release of Python and updates to this package's dependencies. See check.yml for +# information about how the concurrency cancellation and workflow triggering works +permissions: + contents: read + +on: + push: + branches: [main] + pull_request: + schedule: + - cron: '7 7 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +name: rolling + +jobs: + # https://twitter.com/mycoliza/status/1571295690063753218 + nightly: + runs-on: ubuntu-latest + name: ubuntu / 3.13-dev + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install pdm + uses: pdm-project/setup-pdm@v4 + with: + python-version: "3.11" + - name: Install python + uses: actions/setup-python@v5 + with: + python-version: "3.13-dev" + - run: python --version + - name: pdm lock + if: hashFiles('pdm.lock') == '' + run: pdm lock + - name: pdm sync --dev + run: pdm sync --dev + - name: make test + run: make test + # https://twitter.com/alcuadrado/status/1571291687837732873 + update: + # This action checks that updating the dependencies of this package to the latest available that + # satisfy the versions in pyproject.toml does not break this package. This is important as consumers + # of this package will generally use the latest available versions. This is subject to the version + # constraints in pyproject.toml (i.e. pdm does not update past them unless explicitly told + # to). + runs-on: ubuntu-latest + name: ubuntu / 3.12 / updates work + # There's no point running this if no pdm.lock was checked in in the first place, since we'd + # just redo what happened in the regular test job. Unfortunately, hashFiles only works in `if` on + # steps, so we repeat it. + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install 3.12 + if: hashFiles('pdm.lock') != '' + uses: pdm-project/setup-pdm@v4 + with: + python-version: "3.12" + - name: pdm update + if: hashFiles('pdm.lock') != '' + run: pdm update + - name: pdm sync --dev + if: hashFiles('pdm.lock') != '' + run: pdm sync --dev + - name: make test + if: hashFiles('pdm.lock') != '' + run: make test diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..1a9978d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,180 @@ +name: test + +on: + push: + branches: + - main + pull_request: + merge_group: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +env: + MINIMUM_PYTHON_VERSION: "3.8" + +jobs: + + coverage: + # This action runs tests for coverage collection and uploads them to codecov.io.
+ # This requires the secret `CODECOV_TOKEN` to be set as a secret on GitHub, both for + # Actions and Dependabot + + name: "${{ matrix.os }} / ${{ matrix.python-version }} / coverage" + strategy: + max-parallel: 4 + fail-fast: false + matrix: + os: [ubuntu] + python-version: + # remove the unused versions + - "3.8" + - "3.9" + - "3.10" + - "3.11" + - "3.12" + + runs-on: ${{ matrix.os }}-latest + env: + OS: ${{ matrix.os }}-latest + PYTHON: ${{ matrix.python-version }} + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Set up the environment + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ matrix.python-version }} + + - name: Load cached venv + id: cached-venv + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/pdm.lock') }}-${{ hashFiles('.github/workflows/test.yml') }} + + - name: Install dependencies + if: steps.cached-venv.outputs.cache-hit != 'true' + run: make install-dev + + - name: Run tests for coverage + run: make test-w-coverage cov_report=xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + # directory: ./coverage + env_vars: OS,PYTHON + fail_ci_if_error: true + # files: ./coverage/coverage.xml + # flags: unittests + # name: codecov-umbrella + verbose: true + + doctests: + # This action runs doctests for coverage collection and uploads them to codecov.io. + # This requires the secret `CODECOV_TOKEN` to be set as a secret on GitHub, both for + # Actions and Dependabot + + name: "${{ matrix.os }} / 3.8 / doctest" + strategy: + max-parallel: 4 + fail-fast: false + matrix: + os: [ubuntu] + + runs-on: ${{ matrix.os }}-latest + env: + OS: ${{ matrix.os }}-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + + - name: Set up the environment + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + + - name: Load cached venv + id: cached-venv + uses: actions/cache@v4 + with: + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/pyproject.toml') }}-${{ hashFiles('**/pdm.lock') }}-${{ hashFiles('.github/workflows/test.yml') }} + + - name: Install dependencies + if: steps.cached-venv.outputs.cache-hit != 'true' + run: make install-dev + #---------------------------------------------- + # Run tests and upload coverage + #---------------------------------------------- + - name: make doc-tests + run: make doc-tests cov_report=xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + # directory: ./coverage + env_vars: OS,PYTHON,TESTTYPE + fail_ci_if_error: true + # files: ./coverage/coverage.xml + # flags: unittests + # name: codecov-umbrella + verbose: true + env: + PYTHON: ${{ env.MINIMUM_PYTHON_VERSION }} + TESTTYPE: doctest + + minimal: + # This action chooses the oldest versions of the dependencies permitted by pyproject.toml to ensure + # that this package is compatible with the minimal versions that this package and its dependencies + # require. This will pick up issues where this package relies on functionality that was introduced + # later than the actual version specified (e.g., when we require just a minimum version, but a + # method was added after that version).
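+ # + # To reproduce this check locally (assuming pdm and GNU make are installed), the + # equivalent commands are: + # + # pdm lock --strategy direct_minimal_versions + # pdm sync --dev + # make test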
+ # + + runs-on: ubuntu-latest + name: ubuntu / 3.8 / minimal-versions + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Set up the environment + uses: pdm-project/setup-pdm@v4 + id: setup-python + with: + python-version: ${{ env.MINIMUM_PYTHON_VERSION }} + + - name: pdm lock --strategy direct_minimal_versions + run: pdm lock --strategy direct_minimal_versions + - name: pdm sync --dev + run: pdm sync --dev + - name: make test + run: make test + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + test-check: + if: always() + needs: + - coverage + - doctests + - minimal + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} + diff --git a/.gitignore b/.gitignore index 68bc17f..04f1bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,4 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ +.pdm-python diff --git a/.sourcery.yaml b/.sourcery.yaml new file mode 100644 index 0000000..0258fc3 --- /dev/null +++ b/.sourcery.yaml @@ -0,0 +1,8 @@ +rule_settings: + enable: + - default + rule_types: + - refactoring + - suggestion + - comment + python_version: "3.8" diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..31f9d95 --- /dev/null +++ b/Makefile @@ -0,0 +1,165 @@ + +# use this Makefile as a base in your project by running +# git remote add make https://github.com/spraakbanken/python-pdm-make-conf +# git fetch make +# git merge --allow-unrelated-histories make/main +# +# To later update this makefile: +# git fetch make +# git merge make/main +# +.DEFAULT_GOAL := help + +.PHONY: help +help: + @echo "usage:" + @echo "dev | install-dev" + @echo " setup development environment" + @echo "install" + @echo " setup production environment" + @echo "" + @echo "info" + @echo " print info about the system and project" + @echo "" + @echo "test" + @echo " run all tests" + @echo "" + @echo "test-w-coverage [cov=] [cov_report=]" + @echo " run all tests with coverage collection. (Default: cov_report='term-missing', cov='--cov=${PROJECT_SRC}')" + @echo "" + @echo "lint" + @echo " lint the code" + @echo "" + @echo "lint-fix" + @echo " lint the code and try to fix it" + @echo "" + @echo "type-check" + @echo " check types" + @echo "" + @echo "fmt" + @echo " format the code" + @echo "" + @echo "check-fmt" + @echo " check that the code is formatted" + @echo "" + @echo "bumpversion [part=]" + @echo " bumps the given part of the version of the project. (Default: part='patch')" + @echo "" + @echo "bumpversion-show" + @echo " shows the bump path that is possible" + @echo "" + @echo "publish [branch=]" + @echo " pushes the given branch including tags to origin, for CI to publish based on tags.
(Default: branch='main')" + @echo " Typically used after 'make bumpversion'" + @echo "" + @echo "prepare-release" + @echo " run tasks to prepare a release" + @echo "" + +PLATFORM := `uname -o` +REPO := "graph-py" +PROJECT_SRC := "src/parallel_corpus" + +ifeq (${VIRTUAL_ENV},) + VENV_NAME = .venv + INVENV = pdm run +else + VENV_NAME = ${VIRTUAL_ENV} + INVENV = +endif + +default_cov := "--cov=${PROJECT_SRC}" +cov_report := "term-missing" +cov := ${default_cov} + +all_tests := tests +tests := tests + +info: + @echo "Platform: ${PLATFORM}" + @echo "INVENV: '${INVENV}'" + +dev: install-dev + +# setup development environment +install-dev: + pdm install --dev + +# setup production environment +install: + pdm sync --prod + +lock: pdm.lock + +pdm.lock: pyproject.toml + pdm lock + +.PHONY: test +test: + ${INVENV} pytest -vv ${tests} + +.PHONY: test-w-coverage +# run all tests with coverage collection +test-w-coverage: + ${INVENV} pytest -vv ${cov} --cov-report=${cov_report} ${all_tests} + +.PHONY: doc-tests +doc-tests: + ${INVENV} pytest ${cov} --cov-report=${cov_report} --doctest-modules ${PROJECT_SRC} + +.PHONY: type-check +# check types +type-check: + ${INVENV} mypy ${PROJECT_SRC} ${tests} + +.PHONY: lint +# lint the code +lint: + ${INVENV} ruff check ${PROJECT_SRC} ${tests} + +.PHONY: lint-fix +# lint the code (and fix if possible) +lint-fix: + ${INVENV} ruff check --fix ${PROJECT_SRC} ${tests} + +part := "patch" +bumpversion: + ${INVENV} bump-my-version bump ${part} + +bumpversion-show: + ${INVENV} bump-my-version show-bump + +# run formatter(s) +fmt: + ${INVENV} ruff format ${PROJECT_SRC} ${tests} + +.PHONY: check-fmt +# check formatting +check-fmt: + ${INVENV} ruff format --check ${PROJECT_SRC} ${tests} + +build: + pdm build + +branch := "main" +publish: + git push -u origin ${branch} --tags + + +.PHONY: prepare-release +prepare-release: update-changelog tests/requirements-testing.lock + +# we use lock extension so that dependabot doesn't pick up changes in this file +tests/requirements-testing.lock: pyproject.toml pdm.lock + pdm export --dev --format requirements --output $@ + +.PHONY: update-changelog +update-changelog: CHANGELOG.md + +CHANGELOG.md: + git cliff --unreleased --prepend $@ + +# update snapshots for `syrupy` +.PHONY: snapshot-update +snapshot-update: + ${INVENV} pytest --snapshot-update \ No newline at end of file diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..18a6155 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,8 @@ +[mypy] +mypy_path = src +namespace_packages = True +explicit_package_bases = True +show_error_codes = True +ignore_missing_imports = True +python_version = 3.8 +; plugins = adt.mypy_plugin diff --git a/pdm.lock b/pdm.lock new file mode 100644 index 0000000..876864e --- /dev/null +++ b/pdm.lock @@ -0,0 +1,383 @@ +# This file is @generated by PDM. +# It is not intended for manual editing. + +[metadata] +groups = ["default", "dev"] +strategy = ["cross_platform", "inherit_metadata"] +lock_version = "4.4.1" +content_hash = "sha256:45a31179520f4206be41a3c63086952a5a2fb833ae1a85e98262d71ed7988196" + +[[package]] +name = "colorama" +version = "0.4.6" +requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +summary = "Cross-platform colored terminal text." 
+groups = ["dev"] +marker = "sys_platform == \"win32\"" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "colored" +version = "1.4.4" +summary = "Simple library for color and formatting to terminal" +groups = ["dev"] +files = [ + {file = "colored-1.4.4.tar.gz", hash = "sha256:04ff4d4dd514274fe3b99a21bb52fb96f2688c01e93fba7bef37221e7cb56ce0"}, +] + +[[package]] +name = "coverage" +version = "7.5.0" +requires_python = ">=3.8" +summary = "Code coverage measurement for Python" +groups = ["dev"] +files = [ + {file = "coverage-7.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:432949a32c3e3f820af808db1833d6d1631664d53dd3ce487aa25d574e18ad1c"}, + {file = "coverage-7.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2bd7065249703cbeb6d4ce679c734bef0ee69baa7bff9724361ada04a15b7e3b"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbfe6389c5522b99768a93d89aca52ef92310a96b99782973b9d11e80511f932"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39793731182c4be939b4be0cdecde074b833f6171313cf53481f869937129ed3"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85a5dbe1ba1bf38d6c63b6d2c42132d45cbee6d9f0c51b52c59aa4afba057517"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:357754dcdfd811462a725e7501a9b4556388e8ecf66e79df6f4b988fa3d0b39a"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a81eb64feded34f40c8986869a2f764f0fe2db58c0530d3a4afbcde50f314880"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:51431d0abbed3a868e967f8257c5faf283d41ec882f58413cf295a389bb22e58"}, + {file = "coverage-7.5.0-cp310-cp310-win32.whl", hash = "sha256:f609ebcb0242d84b7adeee2b06c11a2ddaec5464d21888b2c8255f5fd6a98ae4"}, + {file = "coverage-7.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:6782cd6216fab5a83216cc39f13ebe30adfac2fa72688c5a4d8d180cd52e8f6a"}, + {file = "coverage-7.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e768d870801f68c74c2b669fc909839660180c366501d4cc4b87efd6b0eee375"}, + {file = "coverage-7.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:84921b10aeb2dd453247fd10de22907984eaf80901b578a5cf0bb1e279a587cb"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:710c62b6e35a9a766b99b15cdc56d5aeda0914edae8bb467e9c355f75d14ee95"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c379cdd3efc0658e652a14112d51a7668f6bfca7445c5a10dee7eabecabba19d"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fea9d3ca80bcf17edb2c08a4704259dadac196fe5e9274067e7a20511fad1743"}, + {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:41327143c5b1d715f5f98a397608f90ab9ebba606ae4e6f3389c2145410c52b1"}, + {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:565b2e82d0968c977e0b0f7cbf25fd06d78d4856289abc79694c8edcce6eb2de"}, + {file = 
"coverage-7.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cf3539007202ebfe03923128fedfdd245db5860a36810136ad95a564a2fdffff"}, + {file = "coverage-7.5.0-cp311-cp311-win32.whl", hash = "sha256:bf0b4b8d9caa8d64df838e0f8dcf68fb570c5733b726d1494b87f3da85db3a2d"}, + {file = "coverage-7.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c6384cc90e37cfb60435bbbe0488444e54b98700f727f16f64d8bfda0b84656"}, + {file = "coverage-7.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fed7a72d54bd52f4aeb6c6e951f363903bd7d70bc1cad64dd1f087980d309ab9"}, + {file = "coverage-7.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cbe6581fcff7c8e262eb574244f81f5faaea539e712a058e6707a9d272fe5b64"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad97ec0da94b378e593ef532b980c15e377df9b9608c7c6da3506953182398af"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd4bacd62aa2f1a1627352fe68885d6ee694bdaebb16038b6e680f2924a9b2cc"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adf032b6c105881f9d77fa17d9eebe0ad1f9bfb2ad25777811f97c5362aa07f2"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ba01d9ba112b55bfa4b24808ec431197bb34f09f66f7cb4fd0258ff9d3711b1"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f0bfe42523893c188e9616d853c47685e1c575fe25f737adf473d0405dcfa7eb"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a9a7ef30a1b02547c1b23fa9a5564f03c9982fc71eb2ecb7f98c96d7a0db5cf2"}, + {file = "coverage-7.5.0-cp312-cp312-win32.whl", hash = "sha256:3c2b77f295edb9fcdb6a250f83e6481c679335ca7e6e4a955e4290350f2d22a4"}, + {file = "coverage-7.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:427e1e627b0963ac02d7c8730ca6d935df10280d230508c0ba059505e9233475"}, + {file = "coverage-7.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9dd88fce54abbdbf4c42fb1fea0e498973d07816f24c0e27a1ecaf91883ce69e"}, + {file = "coverage-7.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a898c11dca8f8c97b467138004a30133974aacd572818c383596f8d5b2eb04a9"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07dfdd492d645eea1bd70fb1d6febdcf47db178b0d99161d8e4eed18e7f62fe7"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3d117890b6eee85887b1eed41eefe2e598ad6e40523d9f94c4c4b213258e4a4"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6afd2e84e7da40fe23ca588379f815fb6dbbb1b757c883935ed11647205111cb"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a9960dd1891b2ddf13a7fe45339cd59ecee3abb6b8326d8b932d0c5da208104f"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ced268e82af993d7801a9db2dbc1d2322e786c5dc76295d8e89473d46c6b84d4"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e7c211f25777746d468d76f11719e64acb40eed410d81c26cefac641975beb88"}, + {file = "coverage-7.5.0-cp38-cp38-win32.whl", hash = "sha256:262fffc1f6c1a26125d5d573e1ec379285a3723363f3bd9c83923c9593a2ac25"}, + {file = "coverage-7.5.0-cp38-cp38-win_amd64.whl", hash = 
"sha256:eed462b4541c540d63ab57b3fc69e7d8c84d5957668854ee4e408b50e92ce26a"}, + {file = "coverage-7.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0194d654e360b3e6cc9b774e83235bae6b9b2cac3be09040880bb0e8a88f4a1"}, + {file = "coverage-7.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:33c020d3322662e74bc507fb11488773a96894aa82a622c35a5a28673c0c26f5"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbdf2cae14a06827bec50bd58e49249452d211d9caddd8bd80e35b53cb04631"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3235d7c781232e525b0761730e052388a01548bd7f67d0067a253887c6e8df46"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2de4e546f0ec4b2787d625e0b16b78e99c3e21bc1722b4977c0dddf11ca84e"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4d0e206259b73af35c4ec1319fd04003776e11e859936658cb6ceffdeba0f5be"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2055c4fb9a6ff624253d432aa471a37202cd8f458c033d6d989be4499aed037b"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:075299460948cd12722a970c7eae43d25d37989da682997687b34ae6b87c0ef0"}, + {file = "coverage-7.5.0-cp39-cp39-win32.whl", hash = "sha256:280132aada3bc2f0fac939a5771db4fbb84f245cb35b94fae4994d4c1f80dae7"}, + {file = "coverage-7.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:c58536f6892559e030e6924896a44098bc1290663ea12532c78cef71d0df8493"}, + {file = "coverage-7.5.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:2b57780b51084d5223eee7b59f0d4911c31c16ee5aa12737c7a02455829ff067"}, + {file = "coverage-7.5.0.tar.gz", hash = "sha256:cf62d17310f34084c59c01e027259076479128d11e4661bb6c9acb38c5e19bb8"}, +] + +[[package]] +name = "coverage" +version = "7.5.0" +extras = ["toml"] +requires_python = ">=3.8" +summary = "Code coverage measurement for Python" +groups = ["dev"] +dependencies = [ + "coverage==7.5.0", + "tomli; python_full_version <= \"3.11.0a6\"", +] +files = [ + {file = "coverage-7.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:432949a32c3e3f820af808db1833d6d1631664d53dd3ce487aa25d574e18ad1c"}, + {file = "coverage-7.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2bd7065249703cbeb6d4ce679c734bef0ee69baa7bff9724361ada04a15b7e3b"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbfe6389c5522b99768a93d89aca52ef92310a96b99782973b9d11e80511f932"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:39793731182c4be939b4be0cdecde074b833f6171313cf53481f869937129ed3"}, + {file = "coverage-7.5.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85a5dbe1ba1bf38d6c63b6d2c42132d45cbee6d9f0c51b52c59aa4afba057517"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:357754dcdfd811462a725e7501a9b4556388e8ecf66e79df6f4b988fa3d0b39a"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a81eb64feded34f40c8986869a2f764f0fe2db58c0530d3a4afbcde50f314880"}, + {file = "coverage-7.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:51431d0abbed3a868e967f8257c5faf283d41ec882f58413cf295a389bb22e58"}, + {file = "coverage-7.5.0-cp310-cp310-win32.whl", hash = 
"sha256:f609ebcb0242d84b7adeee2b06c11a2ddaec5464d21888b2c8255f5fd6a98ae4"}, + {file = "coverage-7.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:6782cd6216fab5a83216cc39f13ebe30adfac2fa72688c5a4d8d180cd52e8f6a"}, + {file = "coverage-7.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e768d870801f68c74c2b669fc909839660180c366501d4cc4b87efd6b0eee375"}, + {file = "coverage-7.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:84921b10aeb2dd453247fd10de22907984eaf80901b578a5cf0bb1e279a587cb"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:710c62b6e35a9a766b99b15cdc56d5aeda0914edae8bb467e9c355f75d14ee95"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c379cdd3efc0658e652a14112d51a7668f6bfca7445c5a10dee7eabecabba19d"}, + {file = "coverage-7.5.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fea9d3ca80bcf17edb2c08a4704259dadac196fe5e9274067e7a20511fad1743"}, + {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:41327143c5b1d715f5f98a397608f90ab9ebba606ae4e6f3389c2145410c52b1"}, + {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:565b2e82d0968c977e0b0f7cbf25fd06d78d4856289abc79694c8edcce6eb2de"}, + {file = "coverage-7.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cf3539007202ebfe03923128fedfdd245db5860a36810136ad95a564a2fdffff"}, + {file = "coverage-7.5.0-cp311-cp311-win32.whl", hash = "sha256:bf0b4b8d9caa8d64df838e0f8dcf68fb570c5733b726d1494b87f3da85db3a2d"}, + {file = "coverage-7.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:9c6384cc90e37cfb60435bbbe0488444e54b98700f727f16f64d8bfda0b84656"}, + {file = "coverage-7.5.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fed7a72d54bd52f4aeb6c6e951f363903bd7d70bc1cad64dd1f087980d309ab9"}, + {file = "coverage-7.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cbe6581fcff7c8e262eb574244f81f5faaea539e712a058e6707a9d272fe5b64"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad97ec0da94b378e593ef532b980c15e377df9b9608c7c6da3506953182398af"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd4bacd62aa2f1a1627352fe68885d6ee694bdaebb16038b6e680f2924a9b2cc"}, + {file = "coverage-7.5.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adf032b6c105881f9d77fa17d9eebe0ad1f9bfb2ad25777811f97c5362aa07f2"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:4ba01d9ba112b55bfa4b24808ec431197bb34f09f66f7cb4fd0258ff9d3711b1"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:f0bfe42523893c188e9616d853c47685e1c575fe25f737adf473d0405dcfa7eb"}, + {file = "coverage-7.5.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a9a7ef30a1b02547c1b23fa9a5564f03c9982fc71eb2ecb7f98c96d7a0db5cf2"}, + {file = "coverage-7.5.0-cp312-cp312-win32.whl", hash = "sha256:3c2b77f295edb9fcdb6a250f83e6481c679335ca7e6e4a955e4290350f2d22a4"}, + {file = "coverage-7.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:427e1e627b0963ac02d7c8730ca6d935df10280d230508c0ba059505e9233475"}, + {file = "coverage-7.5.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9dd88fce54abbdbf4c42fb1fea0e498973d07816f24c0e27a1ecaf91883ce69e"}, + {file = 
"coverage-7.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a898c11dca8f8c97b467138004a30133974aacd572818c383596f8d5b2eb04a9"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07dfdd492d645eea1bd70fb1d6febdcf47db178b0d99161d8e4eed18e7f62fe7"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3d117890b6eee85887b1eed41eefe2e598ad6e40523d9f94c4c4b213258e4a4"}, + {file = "coverage-7.5.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6afd2e84e7da40fe23ca588379f815fb6dbbb1b757c883935ed11647205111cb"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:a9960dd1891b2ddf13a7fe45339cd59ecee3abb6b8326d8b932d0c5da208104f"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ced268e82af993d7801a9db2dbc1d2322e786c5dc76295d8e89473d46c6b84d4"}, + {file = "coverage-7.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e7c211f25777746d468d76f11719e64acb40eed410d81c26cefac641975beb88"}, + {file = "coverage-7.5.0-cp38-cp38-win32.whl", hash = "sha256:262fffc1f6c1a26125d5d573e1ec379285a3723363f3bd9c83923c9593a2ac25"}, + {file = "coverage-7.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:eed462b4541c540d63ab57b3fc69e7d8c84d5957668854ee4e408b50e92ce26a"}, + {file = "coverage-7.5.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d0194d654e360b3e6cc9b774e83235bae6b9b2cac3be09040880bb0e8a88f4a1"}, + {file = "coverage-7.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:33c020d3322662e74bc507fb11488773a96894aa82a622c35a5a28673c0c26f5"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbdf2cae14a06827bec50bd58e49249452d211d9caddd8bd80e35b53cb04631"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3235d7c781232e525b0761730e052388a01548bd7f67d0067a253887c6e8df46"}, + {file = "coverage-7.5.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2de4e546f0ec4b2787d625e0b16b78e99c3e21bc1722b4977c0dddf11ca84e"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4d0e206259b73af35c4ec1319fd04003776e11e859936658cb6ceffdeba0f5be"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2055c4fb9a6ff624253d432aa471a37202cd8f458c033d6d989be4499aed037b"}, + {file = "coverage-7.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:075299460948cd12722a970c7eae43d25d37989da682997687b34ae6b87c0ef0"}, + {file = "coverage-7.5.0-cp39-cp39-win32.whl", hash = "sha256:280132aada3bc2f0fac939a5771db4fbb84f245cb35b94fae4994d4c1f80dae7"}, + {file = "coverage-7.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:c58536f6892559e030e6924896a44098bc1290663ea12532c78cef71d0df8493"}, + {file = "coverage-7.5.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:2b57780b51084d5223eee7b59f0d4911c31c16ee5aa12737c7a02455829ff067"}, + {file = "coverage-7.5.0.tar.gz", hash = "sha256:cf62d17310f34084c59c01e027259076479128d11e4661bb6c9acb38c5e19bb8"}, +] + +[[package]] +name = "diff-match-patch" +version = "20230430" +requires_python = ">=3.7" +summary = "Diff Match and Patch" +groups = ["default"] +files = [ + {file = "diff-match-patch-20230430.tar.gz", hash = "sha256:953019cdb9c9d2c9e47b5b12bcff3cf4746fc4598eb406076fa1fc27e6a1f15c"}, + {file = 
"diff_match_patch-20230430-py3-none-any.whl", hash = "sha256:dce43505fb7b1b317de7195579388df0746d90db07015ed47a85e5e44930ef93"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.1" +requires_python = ">=3.7" +summary = "Backport of PEP 654 (exception groups)" +groups = ["dev"] +marker = "python_version < \"3.11\"" +files = [ + {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, + {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +requires_python = ">=3.7" +summary = "brain-dead simple config-ini parsing" +groups = ["dev"] +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "more-itertools" +version = "10.2.0" +requires_python = ">=3.8" +summary = "More routines for operating on iterables, beyond itertools" +groups = ["default"] +files = [ + {file = "more-itertools-10.2.0.tar.gz", hash = "sha256:8fccb480c43d3e99a00087634c06dd02b0d50fbf088b380de5a41a015ec239e1"}, + {file = "more_itertools-10.2.0-py3-none-any.whl", hash = "sha256:686b06abe565edfab151cb8fd385a05651e1fdf8f0a14191e4439283421f8684"}, +] + +[[package]] +name = "mypy" +version = "1.10.0" +requires_python = ">=3.8" +summary = "Optional static typing for Python" +groups = ["dev"] +dependencies = [ + "mypy-extensions>=1.0.0", + "tomli>=1.1.0; python_version < \"3.11\"", + "typing-extensions>=4.1.0", +] +files = [ + {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, + {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, + {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, + {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, + {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, + {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, + {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, + {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, + {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, + {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, + {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, + {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, + {file = 
"mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, + {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, + {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, + {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, + {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, + {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, + {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, + {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, + {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, + {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, + {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, + {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, + {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, + {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, + {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, +] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +requires_python = ">=3.5" +summary = "Type system extensions for programs checked with the mypy type checker." 
+groups = ["dev"] +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "packaging" +version = "24.0" +requires_python = ">=3.7" +summary = "Core utilities for Python packages" +groups = ["dev"] +files = [ + {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, + {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +requires_python = ">=3.8" +summary = "plugin and hook calling mechanisms for python" +groups = ["dev"] +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[[package]] +name = "pytest" +version = "7.4.4" +requires_python = ">=3.7" +summary = "pytest: simple powerful testing with Python" +groups = ["dev"] +dependencies = [ + "colorama; sys_platform == \"win32\"", + "exceptiongroup>=1.0.0rc8; python_version < \"3.11\"", + "iniconfig", + "packaging", + "pluggy<2.0,>=0.12", + "tomli>=1.0.0; python_version < \"3.11\"", +] +files = [ + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, +] + +[[package]] +name = "pytest-cov" +version = "5.0.0" +requires_python = ">=3.8" +summary = "Pytest plugin for measuring coverage." +groups = ["dev"] +dependencies = [ + "coverage[toml]>=5.2.1", + "pytest>=4.6", +] +files = [ + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, +] + +[[package]] +name = "ruff" +version = "0.4.2" +requires_python = ">=3.7" +summary = "An extremely fast Python linter and code formatter, written in Rust." 
+groups = ["dev"] +files = [ + {file = "ruff-0.4.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:8d14dc8953f8af7e003a485ef560bbefa5f8cc1ad994eebb5b12136049bbccc5"}, + {file = "ruff-0.4.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:24016ed18db3dc9786af103ff49c03bdf408ea253f3cb9e3638f39ac9cf2d483"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e2e06459042ac841ed510196c350ba35a9b24a643e23db60d79b2db92af0c2b"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3afabaf7ba8e9c485a14ad8f4122feff6b2b93cc53cd4dad2fd24ae35112d5c5"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:799eb468ea6bc54b95527143a4ceaf970d5aa3613050c6cff54c85fda3fde480"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:ec4ba9436a51527fb6931a8839af4c36a5481f8c19e8f5e42c2f7ad3a49f5069"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a2243f8f434e487c2a010c7252150b1fdf019035130f41b77626f5655c9ca22"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8772130a063f3eebdf7095da00c0b9898bd1774c43b336272c3e98667d4fb8fa"}, + {file = "ruff-0.4.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab165ef5d72392b4ebb85a8b0fbd321f69832a632e07a74794c0e598e7a8376"}, + {file = "ruff-0.4.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:1f32cadf44c2020e75e0c56c3408ed1d32c024766bd41aedef92aa3ca28eef68"}, + {file = "ruff-0.4.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:22e306bf15e09af45ca812bc42fa59b628646fa7c26072555f278994890bc7ac"}, + {file = "ruff-0.4.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:82986bb77ad83a1719c90b9528a9dd663c9206f7c0ab69282af8223566a0c34e"}, + {file = "ruff-0.4.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:652e4ba553e421a6dc2a6d4868bc3b3881311702633eb3672f9f244ded8908cd"}, + {file = "ruff-0.4.2-py3-none-win32.whl", hash = "sha256:7891ee376770ac094da3ad40c116258a381b86c7352552788377c6eb16d784fe"}, + {file = "ruff-0.4.2-py3-none-win_amd64.whl", hash = "sha256:5ec481661fb2fd88a5d6cf1f83403d388ec90f9daaa36e40e2c003de66751798"}, + {file = "ruff-0.4.2-py3-none-win_arm64.whl", hash = "sha256:cbd1e87c71bca14792948c4ccb51ee61c3296e164019d2d484f3eaa2d360dfaf"}, + {file = "ruff-0.4.2.tar.gz", hash = "sha256:33bcc160aee2520664bc0859cfeaebc84bb7323becff3f303b8f1f2d81cb4edc"}, +] + +[[package]] +name = "strenum" +version = "0.4.15" +summary = "An Enum that inherits from str." 
+groups = ["default"] +files = [ + {file = "StrEnum-0.4.15-py3-none-any.whl", hash = "sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659"}, + {file = "StrEnum-0.4.15.tar.gz", hash = "sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff"}, +] + +[[package]] +name = "syrupy" +version = "3.0.6" +requires_python = ">=3.7,<4" +summary = "Pytest Snapshot Test Utility" +groups = ["dev"] +dependencies = [ + "colored<2.0.0,>=1.3.92", + "pytest<8.0.0,>=5.1.0", +] +files = [ + {file = "syrupy-3.0.6-py3-none-any.whl", hash = "sha256:9c18e22264026b34239bcc87ab7cc8d893eb17236ea7dae634217ea4f22a848d"}, + {file = "syrupy-3.0.6.tar.gz", hash = "sha256:583aa5ca691305c27902c3e29a1ce9da50ff9ab5f184c54b1dc124a16e4a6cf4"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +requires_python = ">=3.7" +summary = "A lil' TOML parser" +groups = ["dev"] +marker = "python_version < \"3.11\"" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typing-extensions" +version = "4.11.0" +requires_python = ">=3.8" +summary = "Backported and Experimental Type Hints for Python 3.8+" +groups = ["default", "dev"] +files = [ + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..50d1d34 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "parallel-corpus" +version = "0.1.0" +description = "TBD" +authors = [ + { name = "Kristoffer Andersson", email = "kristoffer.andersson@gu.se" }, +] +dependencies = [ + "diff-match-patch>=20230430", + "more-itertools>=10.2.0", + "typing-extensions>=4.11.0", + "strenum>=0.4.15", # For StrEnum in Python < 3.10 +] +requires-python = ">=3.8" +readme = "README.md" +license = { text = "MIT" } + +[build-system] +requires = ["pdm-backend"] +build-backend = "pdm.backend" + + +[tool.pdm] +distribution = true + +[tool.pdm.dev-dependencies] +dev = [ + "syrupy>=3.0.6", + "pytest>=7.4.4", + "ruff>=0.4.1", + "mypy>=1.9.0", + "pytest-cov>=5.0.0", +] diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..0c8b27f --- /dev/null +++ b/ruff.toml @@ -0,0 +1,61 @@ +line-length = 97 + +target-version = "py38" + +[lint] +# Enable flake8-bugbear (`B`) rules, among others. +select = [ + "A", + # "ANN", + "B", + "BLE", + "C4", + "C90", + # "D", + "E", + "F", + "FBT", + "I", + "RUF", + "S", + "YTT", +] + +# Never enforce `E501` (line length violations). +# ignore = ["E501"] +ignore = ["ANN101", "ANN102", "D203", "D213"] + + +# Avoid trying to fix flake8-bugbear (`B`) violations. +unfixable = ["B"] + +# Per-file ignores; e.g. `E402` (import violations) could be ignored in all `__init__.py` files.
+[lint.per-file-ignores] +"tests/*" = ["D100", "D101", "D102", "D103", "D104", "S101"] +"bases/sblex/webapp/tests/**/*" = [ + "D100", + "D101", + "D102", + "D103", + "D104", + "S101", +] +"src/sblex/app.py" = ["A", "E", "F", "I"] +"src/sblex/compound.py" = ["A", "E", "F", "I", "RUF"] +"src/sblex/dist.py" = ["A", "E", "F", "I"] +"src/sblex/fullform*.py" = ["A", "E", "F", "I", "B", "C"] +"src/sblex/glsib*.py" = ["A", "E", "F", "I"] +"src/sblex/handler.py" = ["A", "E", "F", "I", "C", "S"] +"src/sblex/lem.py" = ["A", "E", "F", "I"] +"src/sblex/lemma.py" = ["A", "E", "F", "I"] +"src/sblex/lexeme.py" = ["A", "E", "F", "I"] +"src/sblex/lsib.py" = ["A", "E", "F", "I"] +"src/sblex/md1.py" = ["A", "E", "F", "I"] +"src/sblex/paradigms.py" = ["A", "E", "F", "I", "S"] +"src/sblex/plist.py" = ["A", "E", "F", "I"] +"src/sblex/pos.py" = ["A", "E", "F", "I"] +"src/sblex/saldo_util.py" = ["A", "B", "C", "E", "F", "FBT", "I"] +"src/sblex/sib.py" = ["A", "E", "F", "I"] +"src/sblex/table.py" = ["A", "E", "F", "I", "S"] +"tests/e2e/webapp/test_fullform_lex_api.py" = ["E501"] +# "__init__.py" = ["E402"] diff --git a/src/parallel_corpus/__init__.py b/src/parallel_corpus/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/parallel_corpus/graph.py b/src/parallel_corpus/graph.py new file mode 100644 index 0000000..3f74f9e --- /dev/null +++ b/src/parallel_corpus/graph.py @@ -0,0 +1,387 @@ +import itertools +import logging +import re +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, TypeVar + +import parallel_corpus.shared.ranges +import parallel_corpus.shared.str_map +import parallel_corpus.shared.union_find +from parallel_corpus import shared, token +from parallel_corpus.shared import dicts, diffs, ids, lists +from parallel_corpus.shared.unique_check import UniqueCheck +from parallel_corpus.source_target import Side, SourceTarget, map_sides +from parallel_corpus.token import Token + +A = TypeVar("A") +B = TypeVar("B") + + +ALL_WHITESPACE = re.compile(r"^\s+$") +NO_WHITESPACE_AT_END = re.compile(r"\S$") + +logger = logging.getLogger(__name__) + + +@dataclass +class Edge: + # a copy of the identifier used in the edges object of the graph + id: str + # these are ids to source and target tokens + ids: List[str] + # labels on this edge + labels: List[str] + # is this manually or automatically aligned + manual: bool + comment: Optional[str] = None + + +Edges = Dict[str, Edge] + + +@dataclass +class Graph(SourceTarget[List[Token]]): + edges: Edges + comment: Optional[str] = None + + def copy_with_updated_side_and_edges( + self, side: Side, new_tokens: List[Token], edges: Edges + ) -> "Graph": + source = self.source if side == Side.target else new_tokens + target = new_tokens if side == Side.target else self.target + return Graph(source=source, target=target, edges=edges, comment=self.comment) + + def copy_with_edges(self, edges: Edges) -> "Graph": + return Graph(source=self.source, target=self.target, edges=edges, comment=self.comment) + + +def next_id(g: Graph) -> int: + return ids.next_id(itertools.chain((t.id for t in g.target), (s.id for s in g.source))) + + +def edge( + ids: List[str], + labels: List[str], + *, + comment: Optional[str] = None, + manual: bool = False, +) -> Edge: + ids_sorted = sorted(ids) + labels_nub = shared.uniq(labels) + return Edge( + id=f"e-{'-'.join(ids_sorted)}", + ids=ids_sorted, + labels=labels_nub, + manual=manual, + comment=comment, + ) + + +def edge_record(es: Iterable[Edge]) -> Dict[str, Edge]: + return {e.id: e for e in es} + + 
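+# Illustrative usage sketch (not part of the original source; it assumes that +# shared.uniq de-duplicates labels while preserving order). edge() sorts the ids +# and derives a stable identifier from them; edge_record() then keys edges by +# that identifier: +# +# e = edge(["t1", "s1"], ["L", "L"]) +# e.id # "e-s1-t1" -- the sorted ids form the identifier +# e.labels # ["L"] +# edge_record([e]) # {"e-s1-t1": e} + +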
+def init(s: str, *, manual: bool = False) -> Graph:
+    return init_from(token.tokenize(s), manual=manual)
+
+
+def init_from(tokens: List[str], *, manual: bool = False) -> Graph:
+    return align(
+        Graph(
+            source=token.identify(tokens, "s"),
+            target=token.identify(tokens, "t"),
+            edges=edge_record(
+                (edge([f"s{i}", f"t{i}"], [], manual=manual) for i, _ in enumerate(tokens))
+            ),
+        )
+    )
+
+
+def modify(g: Graph, from_: int, to: int, text: str, side: Side = Side.target) -> Graph:
+    return align(unaligned_modify(g, from_, to, text, side))
+
+
+def set_target(g: Graph, text: str) -> Graph:
+    return align(unaligned_set_side(g, Side.target, text))
+
+
+def merge_edges(*es: Edge) -> Edge:
+    ids: List[str] = []
+    labels: List[str] = []
+    manual = False
+    comments: List[str] = []
+    for e in es:
+        ids.extend(iter(e.ids))
+        labels.extend(iter(e.labels))
+        manual = manual or e.manual
+        if e.comment is not None:
+            comments.append(e.comment)
+    return edge(
+        ids=ids,
+        labels=labels,
+        manual=manual,
+        comment="\n\n".join(comments) if comments else None,
+    )
+
+
+zero_edge = merge_edges()
+
+
+def align(g: Graph) -> Graph:
+    # Use a union-find to group characters into edges.
+    uf = parallel_corpus.shared.union_find.poly_union_find(lambda u: u)
+    em = edge_map(g)
+    chars = map_sides(
+        g,
+        lambda tokens, _side: list(
+            itertools.chain(
+                *map(to_char_ids, filter(lambda token: not em[token.id].manual, tokens))
+            )
+        ),
+    )
+    char_diff = diffs.hdiff(chars.source, chars.target, lambda u: u.char, lambda u: u.char)
+    for c in char_diff:
+        # the None ids make the alignment skip spaces;
+        # they originate from to_char_ids
+        if c.change == diffs.ChangeType.CONSTANT and (
+            c.a is not None and c.b is not None and c.a.id is not None and c.b.id is not None
+        ):
+            uf.union(c.a.id, c.b.id)
+    proto_edges = {k: e for k, e in g.edges.items() if e.manual}
+    first: UniqueCheck[str] = UniqueCheck()
+
+    def update_edges(tokens: List[Token], _side: Side) -> None:
+        for tok in tokens:
+            e_repr = em[tok.id]
+            if not e_repr.manual:
+                labels = e_repr.labels if first(e_repr.id) else []
+                e_token = edge([tok.id], labels, manual=False, comment=e_repr.comment)
+                dicts.modify(
+                    proto_edges,
+                    uf.find(tok.id),
+                    zero_edge,
+                    lambda e: merge_edges(e, e_token),  # noqa: B023
+                )
+
+    map_sides(g, update_edges)
+    edges = edge_record(dicts.traverse(proto_edges, lambda e, _: e))
+    return g.copy_with_edges(edges)
+
+
+def rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
+    return align(unaligned_rearrange(g, begin, end, dest))
+
+
+def target_text(g: SourceTarget[List[token.Text]]) -> str:
+    return token.text(g.target)
+
+
+@dataclass
+class CharIdPair:
+    char: str
+    id: Optional[str] = None
+
+
+def to_char_ids(token: Token) -> List[CharIdPair]:
+    return parallel_corpus.shared.str_map.str_map(
+        token.text,
+        lambda char, _i: CharIdPair(char=char, id=None if char == " " else token.id),
+    )
+
+
+def edge_map(g: Graph) -> Dict[str, Edge]:
+    edges: Dict[str, Edge] = {}
+    for e in g.edges.values():
+        for i in e.ids:
+            edges[i] = e
+    return edges
+
+
+def unaligned_set_side(g: Graph, side: Side, text: str) -> Graph:
+    text0 = get_side_text(g, side)
+    edits = parallel_corpus.shared.ranges.edit_range(text0, text)
+
+    from_, to = edits["from"], edits["to"]
+    new_text = text[from_ : (len(text) - (len(text0) - to))]
+    return unaligned_modify(g, from_, to, new_text, side)
+
+
+def unaligned_modify(
+    g: Graph, from_: int, to: int, text: str, side: Side = Side.target
+) -> Graph:
+    """Replace the text at some position, merging the spans it touches upon.
+ + >>> show = lambda g: [t.text for t in g.target] + >>> ids = lambda g: " ".join(t.id for t in g.target) + >>> g = init('test graph hello') + >>> assert show(g) == ['test ', 'graph ', 'hello '] + >>> show(unaligned_modify(g, 0, 0, 'new')) + ['newtest ', 'graph ', 'hello '] + + >>> show(unaligned_modify(g, 0, 1, 'new')) + ['newest ', 'graph ', 'hello '] + + >>> show(unaligned_modify(g, 0, 5, 'new ')) + ['new ', 'graph ', 'hello '] + + >>> show(unaligned_modify(g, 0, 5, 'new')) + ['newgraph ', 'hello '] + + >>> show(unaligned_modify(g, 5, 5, ' ')) + ['test ', ' graph ', 'hello '] + + >>> show(unaligned_modify(g, 5, 6, ' ')) + ['test ', ' raph ', 'hello '] + + >>> show(unaligned_modify(g, 0, 15, '_')) + ['_o '] + + >>> show(unaligned_modify(g, 0, 16, '_')) + ['_ '] + + >>> show(unaligned_modify(g, 0, 17, '_')) + ['_ '] + + >>> show(unaligned_modify(g, 16, 16, ' !')) + ['test ', 'graph ', 'hello ', '! '] + + + Indexes are character offsets (use CodeMirror's doc.posFromIndex and doc.indexFromPos to convert) + """ # noqa: E501 + + tokens = get_side_texts(g, side) + token_at = token.token_at(tokens, from_) + from_token, from_ix = token_at["token"], token_at["offset"] + pre = (tokens[from_token] or "")[:from_ix] + if to == len(get_side_text(g, side)): + return unaligned_modify_tokens(g, from_token, len(g.get_side(side)), pre + text, side) + to_token_at = token.token_at(tokens, to) + to_token, to_ix = to_token_at["token"], to_token_at["offset"] + post = (tokens[to_token] or "")[to_ix:] + return unaligned_modify_tokens(g, from_token, to_token + 1, pre + text + post, side) + + +def get_side_text(g: Graph, side: Side) -> str: + return token.text(g.get_side(side)) + + +def get_side_texts(g: Graph, side: Side) -> List[str]: + return token.texts(g.get_side(side)) + + +def unaligned_modify_tokens( # noqa: C901 + g: Graph, from_: int, to: int, text: str, side: Side = Side.target +) -> Graph: + """# /** Replace the text at some position, merging the spans it touches upon. + + # const show = (g: Graph) => g.target.map(t => t.text) + # const ids = (g: Graph) => g.target.map(t => t.id).join(' ') + # const g = init('test graph hello') + # show(g) // => ['test ', 'graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 0, 'this ')) // => ['this ', 'test ', 'graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 1, 'this ')) // => ['this ', 'graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 1, ' white ')) // => [' white ', 'graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 1, 'this')) // => ['thisgraph ', 'hello '] + # show(unaligned_modify_tokens(g, 1, 2, 'graph')) // => ['test ', 'graphhello '] + # show(unaligned_modify_tokens(g, 1, 2, ' graph ')) // => ['test ', ' graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 1, 'for this ')) // => ['for ', 'this ', 'graph ', 'hello '] + # show(unaligned_modify_tokens(g, 0, 2, '')) // => ['hello '] + # show(unaligned_modify_tokens(g, 0, 2, ' ')) // => [' hello '] + # show(unaligned_modify_tokens(g, 1, 3, ' ')) // => ['test '] + # show(unaligned_modify_tokens(g, 3, 3, ' !')) // => ['test ', 'graph ', 'hello ', '! 
']
+    # show(unaligned_modify_tokens(init('a '), 0, 1, ' ')) // => [' ']
+    # ids(g) // => 't0 t1 t2'
+    # ids(unaligned_modify_tokens(g, 0, 0, 'this ')) // => 't3 t0 t1 t2'
+    # ids(unaligned_modify_tokens(g, 0, 1, 'this ')) // => 't3 t1 t2'
+    # ids(unaligned_modify_tokens(g, 0, 1, 'this')) // => 't3 t2'
+    # const showS = (g: Graph) => g.source.map(t => t.text)
+    # const idsS = (g: Graph) => g.source.map(t => t.id).join(' ')
+    # showS(unaligned_modify_tokens(g, 0, 0, 'this ', 'source')) // => ['this ', 'test ', 'graph ', 'hello ']
+    # idsS(unaligned_modify_tokens(g, 0, 0, 'this ', 'source')) // => 's3 s0 s1 s2'
+
+    # Indexes are token offsets
+    """  # noqa: E501
+
+    if (
+        from_ < 0
+        or to < 0
+        or from_ > len(g.get_side(side))
+        or to > len(g.get_side(side))
+        or from_ > to
+    ):
+        raise ValueError(f"Invalid coordinates {g} {from_} {to} {text}")
+
+    if ALL_WHITESPACE.fullmatch(text):
+        # replacement text is only whitespace: need to find some token to put it on
+        if from_ > 0:
+            return unaligned_modify_tokens(
+                g, from_ - 1, to, g.get_side(side)[from_ - 1].text + text, side
+            )
+        elif to < len(g.get_side(side)):
+            return unaligned_modify_tokens(
+                g, from_, to + 1, text + g.get_side(side)[to].text, side
+            )
+
+        else:
+            logger.warning("Introducing whitespace into empty graph")
+
+    if NO_WHITESPACE_AT_END.match(text[-1:]) is not None and to < len(g.get_side(side)):
+        # if replacement text does not end with whitespace, grab the next word as well
+        return unaligned_modify_tokens(g, from_, to + 1, text + g.get_side(side)[to].text, side)
+
+    if from_ > 0 and from_ == len(g.get_side(side)) and to == len(g.get_side(side)):
+        # we're adding a word at the end but the last token might not end in whitespace:
+        # glue them together
+
+        return unaligned_modify_tokens(
+            g, from_ - 1, to, g.get_side(side)[from_ - 1].text + text, side
+        )
+
+    id_offset = next_id(g)
+
+    tokens = [
+        Token(t, f"{side[0]}{(id_offset + i)}") for i, t in enumerate(token.tokenize(text))
+    ]
+
+    new_tokens, removed = lists.splice(g.get_side(side), from_, to - from_, *tokens)
+
+    ids_removed = {t.id for t in removed}
+
+    new_edge_ids = {t.id for t in tokens}
+    new_edge_labels = set()
+    new_edge_manual = False
+
+    def fun(e: Edge, _id: str) -> bool:
+        if any(id_ in ids_removed for id_ in e.ids):
+            for id_ in e.ids:
+                if id_ not in ids_removed:
+                    new_edge_ids.add(id_)
+            for lbl in e.labels:
+                new_edge_labels.add(lbl)
+            return False
+        return True
+
+    edges = dicts.filter_dict(g.edges, fun)
+
+    if new_edge_ids:
+        e = edge(list(new_edge_ids), list(new_edge_labels), manual=new_edge_manual)
+        edges[e.id] = e
+
+    return g.copy_with_updated_side_and_edges(side, new_tokens, edges)
+
+
+def unaligned_rearrange(g: Graph, begin: int, end: int, dest: int) -> Graph:
+    """Moves a slice of the target tokens and puts it at a new destination.
+ + target_text(unaligned_rearrange(init('apa bepa cepa depa'), 1, 2, 0)) // => 'bepa cepa apa depa ' + + Indexes are token offsets""" # noqa: E501 + em = edge_map(g) + edge_ids_to_update = {em[t.id].id for t in g.target[begin : (end + 1)]} + new_edges = {} + new_edges.update(g.edges) + for id_ in edge_ids_to_update: + new_edges[id_] = merge_edges(g.edges[id_], edge([], [], manual=True)) + return g.copy_with_updated_side_and_edges( + Side.target, lists.rearrange(g.target, begin, end, dest), new_edges + ) diff --git a/src/parallel_corpus/shared/__init__.py b/src/parallel_corpus/shared/__init__.py new file mode 100644 index 0000000..04e5599 --- /dev/null +++ b/src/parallel_corpus/shared/__init__.py @@ -0,0 +1,24 @@ +import re +from typing import List, TypeVar + +from . import diffs + +__all__ = ["diffs"] + + +ENDING_WHITESPACE = re.compile(r"\s$") + + +def end_with_space(s: str) -> str: + if not s: + return s + return f"{s} " if (ENDING_WHITESPACE.fullmatch(s[-1]) is None) else s + + +def uniq(xs: List[str]) -> List[str]: + used = set() + return [x for x in xs if x not in used and (used.add(x) or True)] # type: ignore [func-returns-value] + + +A = TypeVar("A") +B = TypeVar("B") diff --git a/src/parallel_corpus/shared/dicts.py b/src/parallel_corpus/shared/dicts.py new file mode 100644 index 0000000..3176a48 --- /dev/null +++ b/src/parallel_corpus/shared/dicts.py @@ -0,0 +1,26 @@ +from typing import TYPE_CHECKING, Callable, Dict, List, TypeVar + +if TYPE_CHECKING: + from _typeshed import SupportsRichComparison + + K = TypeVar("K", bound=SupportsRichComparison) +else: + K = TypeVar("K") + +A = TypeVar("A") +B = TypeVar("B") +V = TypeVar("V") + + +def modify(x: Dict[K, V], k: K, default: V, f: Callable[[V], V]) -> V: + x[k] = f(x.get(k) or default) + return x[k] + + +def traverse(x: Dict[K, A], k: Callable[[A, K], B], *, sort_keys: bool = False) -> List[B]: + ks = sorted(x.keys()) if sort_keys else x.keys() + return [k(x[i], i) for i in ks] + + +def filter_dict(x: Dict[K, A], k: Callable[[A, K], bool]) -> Dict[K, A]: + return {id_: a for id_, a in x.items() if k(a, id_)} diff --git a/src/parallel_corpus/shared/diffs.py b/src/parallel_corpus/shared/diffs.py new file mode 100644 index 0000000..56d55d0 --- /dev/null +++ b/src/parallel_corpus/shared/diffs.py @@ -0,0 +1,147 @@ +import enum +from typing import Callable, Dict, Generic, List, Optional, Tuple, TypeVar, Union + +import diff_match_patch as dmp_module +from typing_extensions import Self + +from parallel_corpus.shared.str_map import str_map + +dmp = dmp_module.diff_match_patch() + +A = TypeVar("A") +B = TypeVar("B") +C = TypeVar("C") + + +class ChangeType(enum.IntEnum): + DELETED = -1 + CONSTANT = 0 + INSERTED = 1 + + +class Change(Generic[A, B]): + def __init__(self, change: ChangeType, a: Optional[A] = None, b: Optional[B] = None): + if change == ChangeType.DELETED and a is None: + raise ValueError("`a` must be given for DELETED") + if change == ChangeType.CONSTANT and (a is None or b is None): + raise ValueError("both `a` and `b` must be given for CONSTANT") + if change == ChangeType.INSERTED and b is None: + raise ValueError("`b` must be given for INSERTED") + self.change = change + self.a = a + self.b = b + + @classmethod + def constant(cls, a: A, b: B) -> Self: + return cls(ChangeType.CONSTANT, a=a, b=b) + + @classmethod + def deleted(cls, a: A) -> Self: + return cls(ChangeType.DELETED, a=a) + + @classmethod + def inserted(cls, b: B) -> Self: + return cls(ChangeType.INSERTED, b=b) + + def model_dump(self) -> Dict[str, Union[int, A, 
B]]: + out: Dict[str, Union[int, A, B]] = { + "change": int(self.change), + } + if self.a is not None: + out["a"] = self.a + if self.b is not None: + out["b"] = self.b + return out + + def __eq__(self, other) -> bool: + if not isinstance(other, Change): + return NotImplemented + return self.change == other.change and self.a == other.a and self.b == other.b + + def __repr__(self) -> str: + return f"Change(change={self.change!r},a={self.a!r},b={self.b!r})" + + def __str__(self) -> str: + return f"Change(change={self.change},a={self.a},b={self.b})" + + +def char_stream(): + """Make a stream of all unicode characters + + We need this because the diff-match-patch library is hard-coded to work on characters. + + To make a polymorphic diff each unique element is assigned a unique character. + We translate them back to the opaque type after diffing via the characters. + This is used in `hdiff`. + + >>> chars = char_stream() + >>> assert ord(next(chars)) == 0 + >>> assert ord(next(chars)) == 1 + >>> assert ord(next(chars)) == 2 + >>> assert ord(next(chars)) == 3 + + """ + i = 0 + while True: + yield chr(int(str(i), base=16)) + i += 1 + + +def hdiff( # noqa: C901 + xs: List[A], + ys: List[B], + a_cmp: Callable[[A], str] = str, + b_cmp: Callable[[B], str] = str, +) -> List[Change[A, B]]: + to: Dict[str, str] = {} + a_from: Dict[str, List[A]] = {} + b_from: Dict[str, List[B]] = {} + chars = char_stream() + + def assign(c: C, c_cmp: Callable[[C], str], c_from: Dict[str, List[C]]) -> str: + s = c_cmp(c) + u = to.get(s) + if u is None: + u = next(chars) + to[s] = u + arr = c_from.get(u) + if not arr: + arr = [] + c_from[u] = arr + arr.append(c) + return u + + s1 = "".join((assign(a, a_cmp, a_from) for a in xs)) + s2 = "".join((assign(b, b_cmp, b_from) for b in ys)) + d = dmp.diff_main(s1, s2) + + def str_map_change(change: int) -> Callable[[str, int], Change]: + def inner(c: str, _: int) -> Change: + if change == 0: + a = a_from.get(c, []).pop(0) + b = b_from.get(c, []).pop(0) + return Change.constant(a, b) + if change == -1: + a = a_from.get(c, []).pop(0) + return Change.deleted(a) + if change == 1: + b = b_from.get(c, []).pop(0) + return Change.inserted(b) + raise RuntimeError("diff-match-patch change not in range [-1,1]") + + return inner + + def map_change(change: int, cs): + return str_map(cs, str_map_change(change)) + + out = [] + for changes in (map_change(change, cs) for change, cs in d): + # print(f"{changes=}") + out.extend(changes) + return out + + +def token_diff(s1: str, s2: str) -> List[Tuple[int, str]]: + d = dmp.diff_main(s1, s2) + dmp.diff_cleanupSemantic(d) + return d diff --git a/src/parallel_corpus/shared/functional.py b/src/parallel_corpus/shared/functional.py new file mode 100644 index 0000000..50a9d94 --- /dev/null +++ b/src/parallel_corpus/shared/functional.py @@ -0,0 +1,12 @@ +from typing import Callable, Sequence, TypeVar + +A = TypeVar("A") + + +def take_last_while(predicate: Callable[[A], bool], xs: Sequence[A]) -> Sequence[A]: + start = 0 + for e in reversed(xs): + if not predicate(e): + break + start -= 1 + return xs[start:] if start < 0 else xs[:0] diff --git a/src/parallel_corpus/shared/ids.py b/src/parallel_corpus/shared/ids.py new file mode 100644 index 0000000..aa0f58a --- /dev/null +++ b/src/parallel_corpus/shared/ids.py @@ -0,0 +1,21 @@ +import re +from typing import Iterable + +DIGITS = re.compile(r"\d+") + + +def next_id(xs: Iterable[str]) -> int: + """Calculate the next id to use from these identifiers + + next_id([]) // => 0 + next_id(['t1', 't2', 't3']) // => 4 + 
next_id(['u2v5k1', 'b3', 'a0']) // => 6 + next_id(['77j66']) // => 78 + + """ + curr_max = -1 + for x in xs: + for digit in DIGITS.finditer(x): + curr_max = max(curr_max, int(digit[0])) + # xs.forEach(x => (x.match(/\d+/g) || []).forEach(i => (max = Math.max(max, parseInt(i))))) + return curr_max + 1 diff --git a/src/parallel_corpus/shared/lists.py b/src/parallel_corpus/shared/lists.py new file mode 100644 index 0000000..ff44b20 --- /dev/null +++ b/src/parallel_corpus/shared/lists.py @@ -0,0 +1,43 @@ +import copy +from typing import List, Tuple, TypeVar + +A = TypeVar("A") + + +def rearrange(xs: List[A], begin: int, end: int, dest: int) -> List[A]: + """Moves a slice of the items and puts back them at some destination. + + rearrange([0, 1, 2, 3], 1, 2, 0) // => [1, 2, 0, 3] + rearrange([0, 1, 2, 3], 1, 2, 3) // => [0, 3, 1, 2] + + rearrange([0, 1, 2, 3], 1, 2, 1) // => [0, 1, 2, 3] + rearrange([0, 1, 2, 3], 1, 2, 2) // => [0, 1, 2, 3]""" + a, mid, z = split_at_3(xs, begin, end + 1) + w = end - begin + if dest > begin: + dest -= w + pre, post = split_at(a + z, dest) + return pre + mid + post + + +def splice(xs: List[A], start: int, count: int, *insert) -> Tuple[List[A], List[A]]: + ys = copy.deepcopy(xs) + zs = ys[start : (start + count)] + ys[start : (start + count)] = insert + return ys, zs + + +def split_at_3(xs: List[A], start: int, end: int) -> Tuple[List[A], List[A], List[A]]: + """Split an array into three pieces + + splitAt3('0123456'.split(''), 2, 4).map(xs => xs.join('')) // => ['01', '23', '456'] + splitAt3('0123456'.split(''), 2, 2).map(xs => xs.join('')) // => ['01', '', '23456'] + splitAt3('0123456'.split(''), 2, 9).map(xs => xs.join('')) // => ['01', '23456', ''] + splitAt3('0123456'.split(''), 0, 2).map(xs => xs.join('')) // => ['', '01', '23456']""" + ab, c = split_at(xs, end) + a, b = split_at(ab, start) + return a, b, c + + +def split_at(xs: List[A], index: int) -> Tuple[List[A], List[A]]: + return xs[:index], xs[index:] diff --git a/src/parallel_corpus/shared/ranges.py b/src/parallel_corpus/shared/ranges.py new file mode 100644 index 0000000..6945fb6 --- /dev/null +++ b/src/parallel_corpus/shared/ranges.py @@ -0,0 +1,46 @@ +import itertools +from typing import TypedDict + +from parallel_corpus.shared.diffs import token_diff +from parallel_corpus.shared.functional import take_last_while + +EditRange = TypedDict("EditRange", {"from": int, "to": int, "insert": str}) + + +def edit_range(s0: str, s: str) -> EditRange: + """ + >>> edit_range('0123456789', '0189') + {'from': 2, 'to': 8, 'insert': ''} + + >>> edit_range('0123456789', '01') + {'from': 2, 'to': 10, 'insert': ''} + + >>> edit_range('0123456789', '89') + {'from': 0, 'to': 8, 'insert': ''} + + >>> edit_range('0123456789', '') + {'from': 0, 'to': 10, 'insert': ''} + + >>> edit_range('0123456789', '01xyz89') + {'from': 2, 'to': 8, 'insert': 'xyz'} + + >>> edit_range('0123456789', '01xyz') + {'from': 2, 'to': 10, 'insert': 'xyz'} + + >>> edit_range('0123456789', 'xyz89') + {'from': 0, 'to': 8, 'insert': 'xyz'} + + >>> edit_range('0123456789', 'xyz') + {'from': 0, 'to': 10, 'insert': 'xyz'} + + >>> edit_range('', '01') + {'from': 0, 'to': 0, 'insert': '01'} + """ + patches = token_diff(s0, s) + pre = list(itertools.takewhile(lambda i: i[0] == 0, patches)) + post = take_last_while(lambda i: i[0] == 0, patches) + from_ = len("".join((i[1] for i in pre))) + postlen = len("".join((i[1] for i in post))) + to = len(s0) - postlen + insert = s[from_ : (len(s) - (len(s0) - to))] + return {"from": from_, "to": to, "insert": 
insert} diff --git a/src/parallel_corpus/shared/str_map.py b/src/parallel_corpus/shared/str_map.py new file mode 100644 index 0000000..d5b68b4 --- /dev/null +++ b/src/parallel_corpus/shared/str_map.py @@ -0,0 +1,7 @@ +from typing import Callable, List, TypeVar + +A = TypeVar("A") + + +def str_map(s: str, f: Callable[[str, int], A]) -> List[A]: + return [f(s[i], i) for i in range(len(s))] diff --git a/src/parallel_corpus/shared/union_find.py b/src/parallel_corpus/shared/union_find.py new file mode 100644 index 0000000..e201c5d --- /dev/null +++ b/src/parallel_corpus/shared/union_find.py @@ -0,0 +1,122 @@ +import abc +import functools +import json +from dataclasses import dataclass +from typing import Callable, Dict, Generic, List, Optional, Tuple, TypeVar + +from typing_extensions import Self + +A = TypeVar("A") + + +class UnionFindOperations(abc.ABC, Generic[A]): + """Union-find data structure operations""" + + @abc.abstractmethod + def find(self, x: A) -> A: + """What group does this belong to?""" + + @abc.abstractmethod + def union(self, x: A, y: A) -> A: + """Make these belong to the same group.""" + + @abc.abstractmethod + def unions(self, xs: List[A]) -> None: + """Make these belong to the same group.""" + + +class UnionFind(UnionFindOperations[int]): + def __init__(self, *, rev: Optional[List[int]] = None) -> None: + self._rev: List[int] = rev or [] + + def find(self, x: int) -> int: + while x >= len(self._rev): + self._rev.append(None) # type: ignore [arg-type] + if self._rev[x] is None: + self._rev[x] = x + elif self._rev[x] != x: + self._rev[x] = self.find(self._rev[x]) # type: ignore [arg-type] + return self._rev[x] # type: ignore [return-value] + + def union(self, x: int, y: int) -> int: + find_x = self.find(x) + find_y = self.find(y) + if find_x != find_y: + self._rev[find_y] = find_x + return find_x + + def unions(self, xs: List[int]) -> None: + functools.reduce(self.union, xs, xs[0]) + + +@dataclass +class Renumber(Generic[A]): + bw: Dict[str, int] + fw: Dict[int, A] + i = 0 + serialize: Callable[[A], str] + + def num(self, a: A) -> int: + s = self.serialize(a) + if s not in self.bw: + self.fw[self.i] = a + self.bw[s] = self.i + self.i += 1 + return self.bw[s] + + def un(self, n: int) -> Optional[A]: + return self.fw.get(n) + + @classmethod + def init(cls, serialize: Callable[[A], str] = json.dumps) -> Self: + return cls(bw={}, fw={}, serialize=serialize) + + +def renumber( + serialize: Callable[[A], str] = json.dumps, +) -> Tuple[Callable[[int], Optional[A]], Callable[[A], int]]: + """ + Assign unique numbers to each distinct element + + const {un, num} = Renumber() + num('foo') // => 0 + num('bar') // => 1 + num('foo') // => 0 + un(0) // => 'foo' + un(1) // => 'bar' + un(2) // => undefined + + const {un, num} = Renumber(a => a.toLowerCase()) + num('foo') // => 0 + num('FOO') // => 0 + un(0) // => 'foo' + """ + renum: Renumber[A] = Renumber(bw={}, fw={}, serialize=serialize) + + return renum.un, renum.num + + +@dataclass +class PolyUnionFind(Generic[A]): + _uf: UnionFind + _renum: Renumber[A] + + def repr(self, x: A) -> int: + return self._uf.find(self._renum.num(x)) + + def find(self, x: A) -> Optional[A]: + return self._renum.un(self._uf.find(self._renum.num(x))) + + def union(self, x: A, y: A) -> Optional[A]: + return self._renum.un(self._uf.union(self._renum.num(x), self._renum.num(y))) + + def unions(self, xs: List[A]) -> None: + num_xs_0 = self._renum.num(xs[0]) + for x in xs[1:]: + self._uf.union(num_xs_0, self._renum.num(x)) + + +def poly_union_find(serialize: 
Callable[[str], str]) -> PolyUnionFind:
+    renum = Renumber.init(serialize)
+    uf = UnionFind()
+    return PolyUnionFind(_uf=uf, _renum=renum)
diff --git a/src/parallel_corpus/shared/unique_check.py b/src/parallel_corpus/shared/unique_check.py
new file mode 100644
index 0000000..be6b0d2
--- /dev/null
+++ b/src/parallel_corpus/shared/unique_check.py
@@ -0,0 +1,61 @@
+from typing import Dict, Generic, TypeVar
+
+S = TypeVar("S")
+
+
+class UniqueCheck(Generic[S]):
+    """
+    >>> u = UniqueCheck()
+    >>> u(1)
+    True
+    >>> u(1)
+    False
+    >>> u(1)
+    False
+    >>> u(2)
+    True
+    >>> u(3)
+    True
+    >>> u(2)
+    False
+    """
+
+    def __init__(self) -> None:
+        self.c: Count[S] = Count()
+
+    def __call__(self, s: S) -> bool:
+        return self.c.inc(s) == 1
+
+
+class Count(Generic[S]):
+    """
+    >>> u = Count()
+    >>> u.inc(1)
+    1
+    >>> u.inc(1)
+    2
+    >>> u.inc(1)
+    3
+    >>> u.inc(2)
+    1
+    >>> u.inc(3)
+    1
+    >>> u.inc(2)
+    2
+    >>> u.get(1)
+    3
+    >>> u.get(2)
+    2
+    >>> u.get(3)
+    1
+    """
+
+    def __init__(self) -> None:
+        self.m: Dict[S, int] = {}
+
+    def get(self, s: S) -> int:
+        return self.m.get(s) or 0
+
+    def inc(self, s: S) -> int:
+        self.m[s] = self.get(s) + 1
+        return self.get(s)
diff --git a/src/parallel_corpus/source_target.py b/src/parallel_corpus/source_target.py
new file mode 100644
index 0000000..f8c2dd2
--- /dev/null
+++ b/src/parallel_corpus/source_target.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+from typing import Callable, Generic, TypeVar
+
+# strenum backports StrEnum to Python < 3.11;
+# it is not a full drop-in for enum.StrEnum added in Python 3.11
+import strenum
+
+A = TypeVar("A")
+B = TypeVar("B")
+
+
+class Side(strenum.StrEnum):
+    source = "source"
+    target = "target"
+
+
+@dataclass
+class SourceTarget(Generic[A]):
+    source: A
+    target: A
+
+    def get_side(self, side: Side) -> A:
+        return self.source if side == Side.source else self.target
+
+
+def map_sides(g: SourceTarget[A], f: Callable[[A, Side], B]) -> SourceTarget[B]:
+    return SourceTarget(source=f(g.source, Side.source), target=f(g.target, Side.target))
diff --git a/src/parallel_corpus/token.py b/src/parallel_corpus/token.py
new file mode 100644
index 0000000..ff6e32c
--- /dev/null
+++ b/src/parallel_corpus/token.py
@@ -0,0 +1,86 @@
+import re
+from dataclasses import dataclass
+from typing import List, Sequence, TypedDict
+
+from parallel_corpus import shared
+
+
+@dataclass
+class Text:
+    text: str
+
+
+@dataclass
+class Token(Text):
+    id: str
+
+
+@dataclass
+class Span:
+    begin: int
+    end: int
+
+
+def text(ts: Sequence[Text]) -> str:
+    """The text in some tokens
+
+    >>> text(identify(tokenize('apa bepa cepa '), '#'))
+    'apa bepa cepa '
+
+    """
+    return "".join(texts(ts))
+
+
+def texts(ts: Sequence[Text]) -> List[str]:
+    """The texts in some tokens
+
+    >>> texts(identify(tokenize('apa bepa cepa '), '#'))
+    ['apa ', 'bepa ', 'cepa ']
+    """
+    return [t.text for t in ts]
+
+
+def tokenize(s: str) -> List[str]:
+    """Tokenizes text on whitespace, prefers to have trailing whitespace."""
+    return list(
+        map(
+            shared.end_with_space,
+            re.findall(r"\s*\S+\s*", s) or re.findall(r"^\s+$", s) or [],
+        )
+    )
+
+
+def identify(toks: List[str], prefix: str) -> List[Token]:
+    return [Token(text=text, id=f"{prefix}{i}") for i, text in enumerate(toks)]
+
+
+class TokenAt(TypedDict):
+    token: int
+    offset: int
+
+
+def token_at(tokens: List[str], character_offset: int) -> TokenAt:
+    """
+    >>> abc = ['012', '3456', '789']
+    >>> token_at(abc, 0)
+    {'token': 0, 'offset': 0}
+
+    >>> token_at(abc, 2)
+    {'token': 0, 'offset': 2}
+
+    token_at(abc, 3) // => {token: 1,
offset: 0} + token_at(abc, 6) // => {token: 1, offset: 3} + token_at(abc, 7) // => {token: 2, offset: 0} + token_at(abc, 9) // => {token: 2, offset: 2} + token_at(abc, 10) // => {token: 3, offset: 0} + Utils.throws(() => token_at(abc, 11)) // => true + """ + passed = 0 + for i in range(len(tokens)): + w = len(tokens[i]) + passed += w + if passed > character_offset: + return {"token": i, "offset": character_offset - passed + w} + if character_offset == len("".join(tokens)): + return {"token": len(tokens), "offset": 0} + raise IndexError(f"Out of bounds: tokens={tokens}, character_offset={character_offset}") diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/__snapshots__/test_graph.ambr b/tests/__snapshots__/test_graph.ambr new file mode 100644 index 0000000..1caef9b --- /dev/null +++ b/tests/__snapshots__/test_graph.ambr @@ -0,0 +1,156 @@ +# name: test_unaligned_modify[0-0-new] + list([ + 'newtest ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify[0-1-new] + list([ + 'newest ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify[0-15-_] + list([ + '_o ', + ]) +# --- +# name: test_unaligned_modify[0-16-_] + list([ + '_ ', + ]) +# --- +# name: test_unaligned_modify[0-17-_] + list([ + '_ ', + ]) +# --- +# name: test_unaligned_modify[0-5-new ] + list([ + 'new ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify[0-5-new] + list([ + 'newgraph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify[16-16- !] + list([ + 'test ', + 'graph ', + 'hello ', + '! ', + ]) +# --- +# name: test_unaligned_modify[5-5- ] + list([ + 'test ', + ' graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify[5-6- ] + list([ + 'test ', + ' raph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_ids[0-0-this ] + 't3 t0 t1 t2' +# --- +# name: test_unaligned_modify_tokens_ids[0-1-this ] + 't3 t1 t2' +# --- +# name: test_unaligned_modify_tokens_ids[0-1-this] + 't3 t2' +# --- +# name: test_unaligned_modify_tokens_ids_source[0-0-this ] + 's3 s0 s1 s2' +# --- +# name: test_unaligned_modify_tokens_show[0-0-this ] + list([ + 'this ', + 'test ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-1- white ] + list([ + ' white ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-1-for this ] + list([ + 'for ', + 'this ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-1-this ] + list([ + 'this ', + 'graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-1-this] + list([ + 'thisgraph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-2- ] + list([ + ' hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[0-2-] + list([ + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[1-2- graph ] + list([ + 'test ', + ' graph ', + 'hello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[1-2-graph] + list([ + 'test ', + 'graphhello ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[1-3- ] + list([ + 'test ', + ]) +# --- +# name: test_unaligned_modify_tokens_show[3-3- !] + list([ + 'test ', + 'graph ', + 'hello ', + '! 
', + ]) +# --- +# name: test_unaligned_modify_tokens_show_source[0-0-this ] + list([ + 'this ', + 'test ', + 'graph ', + 'hello ', + ]) +# --- diff --git a/tests/__snapshots__/test_token.ambr b/tests/__snapshots__/test_token.ambr new file mode 100644 index 0000000..b71427c --- /dev/null +++ b/tests/__snapshots__/test_token.ambr @@ -0,0 +1,35 @@ +# name: test_tokenize[ -expected2] + list([ + ' ', + ]) +# --- +# name: test_tokenize[ apa bepa cepa -expected5] + list([ + ' apa ', + 'bepa ', + 'cepa ', + ]) +# --- +# name: test_tokenize[ apa bepa cepa-expected4] + list([ + ' apa ', + 'bepa ', + 'cepa ', + ]) +# --- +# name: test_tokenize[ -expected1] + list([ + ' ', + ]) +# --- +# name: test_tokenize[-expected0] + list([ + ]) +# --- +# name: test_tokenize[apa bepa cepa-expected3] + list([ + 'apa ', + 'bepa ', + 'cepa ', + ]) +# --- diff --git a/tests/requirements-testing.lock b/tests/requirements-testing.lock new file mode 100644 index 0000000..dcf1a6d --- /dev/null +++ b/tests/requirements-testing.lock @@ -0,0 +1,146 @@ +# This file is @generated by PDM. +# Please do not edit it manually. + +colorama==0.4.6; sys_platform == "win32" \ + --hash=sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44 \ + --hash=sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6 +colored==1.4.4 \ + --hash=sha256:04ff4d4dd514274fe3b99a21bb52fb96f2688c01e93fba7bef37221e7cb56ce0 +coverage==7.5.0 \ + --hash=sha256:075299460948cd12722a970c7eae43d25d37989da682997687b34ae6b87c0ef0 \ + --hash=sha256:07dfdd492d645eea1bd70fb1d6febdcf47db178b0d99161d8e4eed18e7f62fe7 \ + --hash=sha256:0cbdf2cae14a06827bec50bd58e49249452d211d9caddd8bd80e35b53cb04631 \ + --hash=sha256:2055c4fb9a6ff624253d432aa471a37202cd8f458c033d6d989be4499aed037b \ + --hash=sha256:262fffc1f6c1a26125d5d573e1ec379285a3723363f3bd9c83923c9593a2ac25 \ + --hash=sha256:280132aada3bc2f0fac939a5771db4fbb84f245cb35b94fae4994d4c1f80dae7 \ + --hash=sha256:2b57780b51084d5223eee7b59f0d4911c31c16ee5aa12737c7a02455829ff067 \ + --hash=sha256:2bd7065249703cbeb6d4ce679c734bef0ee69baa7bff9724361ada04a15b7e3b \ + --hash=sha256:3235d7c781232e525b0761730e052388a01548bd7f67d0067a253887c6e8df46 \ + --hash=sha256:33c020d3322662e74bc507fb11488773a96894aa82a622c35a5a28673c0c26f5 \ + --hash=sha256:357754dcdfd811462a725e7501a9b4556388e8ecf66e79df6f4b988fa3d0b39a \ + --hash=sha256:39793731182c4be939b4be0cdecde074b833f6171313cf53481f869937129ed3 \ + --hash=sha256:3c2b77f295edb9fcdb6a250f83e6481c679335ca7e6e4a955e4290350f2d22a4 \ + --hash=sha256:41327143c5b1d715f5f98a397608f90ab9ebba606ae4e6f3389c2145410c52b1 \ + --hash=sha256:427e1e627b0963ac02d7c8730ca6d935df10280d230508c0ba059505e9233475 \ + --hash=sha256:432949a32c3e3f820af808db1833d6d1631664d53dd3ce487aa25d574e18ad1c \ + --hash=sha256:4ba01d9ba112b55bfa4b24808ec431197bb34f09f66f7cb4fd0258ff9d3711b1 \ + --hash=sha256:4d0e206259b73af35c4ec1319fd04003776e11e859936658cb6ceffdeba0f5be \ + --hash=sha256:51431d0abbed3a868e967f8257c5faf283d41ec882f58413cf295a389bb22e58 \ + --hash=sha256:565b2e82d0968c977e0b0f7cbf25fd06d78d4856289abc79694c8edcce6eb2de \ + --hash=sha256:6782cd6216fab5a83216cc39f13ebe30adfac2fa72688c5a4d8d180cd52e8f6a \ + --hash=sha256:6afd2e84e7da40fe23ca588379f815fb6dbbb1b757c883935ed11647205111cb \ + --hash=sha256:710c62b6e35a9a766b99b15cdc56d5aeda0914edae8bb467e9c355f75d14ee95 \ + --hash=sha256:84921b10aeb2dd453247fd10de22907984eaf80901b578a5cf0bb1e279a587cb \ + --hash=sha256:85a5dbe1ba1bf38d6c63b6d2c42132d45cbee6d9f0c51b52c59aa4afba057517 \ + 
--hash=sha256:9c6384cc90e37cfb60435bbbe0488444e54b98700f727f16f64d8bfda0b84656 \ + --hash=sha256:9dd88fce54abbdbf4c42fb1fea0e498973d07816f24c0e27a1ecaf91883ce69e \ + --hash=sha256:a81eb64feded34f40c8986869a2f764f0fe2db58c0530d3a4afbcde50f314880 \ + --hash=sha256:a898c11dca8f8c97b467138004a30133974aacd572818c383596f8d5b2eb04a9 \ + --hash=sha256:a9960dd1891b2ddf13a7fe45339cd59ecee3abb6b8326d8b932d0c5da208104f \ + --hash=sha256:a9a7ef30a1b02547c1b23fa9a5564f03c9982fc71eb2ecb7f98c96d7a0db5cf2 \ + --hash=sha256:ad97ec0da94b378e593ef532b980c15e377df9b9608c7c6da3506953182398af \ + --hash=sha256:adf032b6c105881f9d77fa17d9eebe0ad1f9bfb2ad25777811f97c5362aa07f2 \ + --hash=sha256:bbfe6389c5522b99768a93d89aca52ef92310a96b99782973b9d11e80511f932 \ + --hash=sha256:bd4bacd62aa2f1a1627352fe68885d6ee694bdaebb16038b6e680f2924a9b2cc \ + --hash=sha256:bf0b4b8d9caa8d64df838e0f8dcf68fb570c5733b726d1494b87f3da85db3a2d \ + --hash=sha256:c379cdd3efc0658e652a14112d51a7668f6bfca7445c5a10dee7eabecabba19d \ + --hash=sha256:c58536f6892559e030e6924896a44098bc1290663ea12532c78cef71d0df8493 \ + --hash=sha256:cbe6581fcff7c8e262eb574244f81f5faaea539e712a058e6707a9d272fe5b64 \ + --hash=sha256:ced268e82af993d7801a9db2dbc1d2322e786c5dc76295d8e89473d46c6b84d4 \ + --hash=sha256:cf3539007202ebfe03923128fedfdd245db5860a36810136ad95a564a2fdffff \ + --hash=sha256:cf62d17310f34084c59c01e027259076479128d11e4661bb6c9acb38c5e19bb8 \ + --hash=sha256:d0194d654e360b3e6cc9b774e83235bae6b9b2cac3be09040880bb0e8a88f4a1 \ + --hash=sha256:d3d117890b6eee85887b1eed41eefe2e598ad6e40523d9f94c4c4b213258e4a4 \ + --hash=sha256:db2de4e546f0ec4b2787d625e0b16b78e99c3e21bc1722b4977c0dddf11ca84e \ + --hash=sha256:e768d870801f68c74c2b669fc909839660180c366501d4cc4b87efd6b0eee375 \ + --hash=sha256:e7c211f25777746d468d76f11719e64acb40eed410d81c26cefac641975beb88 \ + --hash=sha256:eed462b4541c540d63ab57b3fc69e7d8c84d5957668854ee4e408b50e92ce26a \ + --hash=sha256:f0bfe42523893c188e9616d853c47685e1c575fe25f737adf473d0405dcfa7eb \ + --hash=sha256:f609ebcb0242d84b7adeee2b06c11a2ddaec5464d21888b2c8255f5fd6a98ae4 \ + --hash=sha256:fea9d3ca80bcf17edb2c08a4704259dadac196fe5e9274067e7a20511fad1743 \ + --hash=sha256:fed7a72d54bd52f4aeb6c6e951f363903bd7d70bc1cad64dd1f087980d309ab9 +diff-match-patch==20230430 \ + --hash=sha256:953019cdb9c9d2c9e47b5b12bcff3cf4746fc4598eb406076fa1fc27e6a1f15c \ + --hash=sha256:dce43505fb7b1b317de7195579388df0746d90db07015ed47a85e5e44930ef93 +exceptiongroup==1.2.1; python_version < "3.11" \ + --hash=sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad \ + --hash=sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16 +iniconfig==2.0.0 \ + --hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ + --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 +more-itertools==10.2.0 \ + --hash=sha256:686b06abe565edfab151cb8fd385a05651e1fdf8f0a14191e4439283421f8684 \ + --hash=sha256:8fccb480c43d3e99a00087634c06dd02b0d50fbf088b380de5a41a015ec239e1 +mypy==1.10.0 \ + --hash=sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061 \ + --hash=sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99 \ + --hash=sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de \ + --hash=sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a \ + --hash=sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9 \ + --hash=sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec \ + 
--hash=sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1 \ + --hash=sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131 \ + --hash=sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f \ + --hash=sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821 \ + --hash=sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5 \ + --hash=sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee \ + --hash=sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e \ + --hash=sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746 \ + --hash=sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2 \ + --hash=sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0 \ + --hash=sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b \ + --hash=sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53 \ + --hash=sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30 \ + --hash=sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda \ + --hash=sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051 \ + --hash=sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2 \ + --hash=sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7 \ + --hash=sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee \ + --hash=sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727 \ + --hash=sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976 \ + --hash=sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4 +mypy-extensions==1.0.0 \ + --hash=sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d \ + --hash=sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782 +packaging==24.0 \ + --hash=sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5 \ + --hash=sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9 +pluggy==1.5.0 \ + --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ + --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 +pytest==7.4.4 \ + --hash=sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280 \ + --hash=sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8 +pytest-cov==5.0.0 \ + --hash=sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652 \ + --hash=sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857 +ruff==0.4.2 \ + --hash=sha256:0e2e06459042ac841ed510196c350ba35a9b24a643e23db60d79b2db92af0c2b \ + --hash=sha256:1f32cadf44c2020e75e0c56c3408ed1d32c024766bd41aedef92aa3ca28eef68 \ + --hash=sha256:22e306bf15e09af45ca812bc42fa59b628646fa7c26072555f278994890bc7ac \ + --hash=sha256:24016ed18db3dc9786af103ff49c03bdf408ea253f3cb9e3638f39ac9cf2d483 \ + --hash=sha256:33bcc160aee2520664bc0859cfeaebc84bb7323becff3f303b8f1f2d81cb4edc \ + --hash=sha256:3afabaf7ba8e9c485a14ad8f4122feff6b2b93cc53cd4dad2fd24ae35112d5c5 \ + --hash=sha256:5ec481661fb2fd88a5d6cf1f83403d388ec90f9daaa36e40e2c003de66751798 \ + --hash=sha256:652e4ba553e421a6dc2a6d4868bc3b3881311702633eb3672f9f244ded8908cd \ + --hash=sha256:6a2243f8f434e487c2a010c7252150b1fdf019035130f41b77626f5655c9ca22 \ + --hash=sha256:6ab165ef5d72392b4ebb85a8b0fbd321f69832a632e07a74794c0e598e7a8376 \ + 
--hash=sha256:7891ee376770ac094da3ad40c116258a381b86c7352552788377c6eb16d784fe \ + --hash=sha256:799eb468ea6bc54b95527143a4ceaf970d5aa3613050c6cff54c85fda3fde480 \ + --hash=sha256:82986bb77ad83a1719c90b9528a9dd663c9206f7c0ab69282af8223566a0c34e \ + --hash=sha256:8772130a063f3eebdf7095da00c0b9898bd1774c43b336272c3e98667d4fb8fa \ + --hash=sha256:8d14dc8953f8af7e003a485ef560bbefa5f8cc1ad994eebb5b12136049bbccc5 \ + --hash=sha256:cbd1e87c71bca14792948c4ccb51ee61c3296e164019d2d484f3eaa2d360dfaf \ + --hash=sha256:ec4ba9436a51527fb6931a8839af4c36a5481f8c19e8f5e42c2f7ad3a49f5069 +strenum==0.4.15 \ + --hash=sha256:878fb5ab705442070e4dd1929bb5e2249511c0bcf2b0eeacf3bcd80875c82eff \ + --hash=sha256:a30cda4af7cc6b5bf52c8055bc4bf4b2b6b14a93b574626da33df53cf7740659 +syrupy==3.0.6 \ + --hash=sha256:583aa5ca691305c27902c3e29a1ce9da50ff9ab5f184c54b1dc124a16e4a6cf4 \ + --hash=sha256:9c18e22264026b34239bcc87ab7cc8d893eb17236ea7dae634217ea4f22a848d +tomli==2.0.1; python_version < "3.11" \ + --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ + --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f +typing-extensions==4.11.0 \ + --hash=sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0 \ + --hash=sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a diff --git a/tests/test_graph.py b/tests/test_graph.py new file mode 100644 index 0000000..8726e4b --- /dev/null +++ b/tests/test_graph.py @@ -0,0 +1,257 @@ +from typing import List + +import pytest +from parallel_corpus import graph, token +from parallel_corpus.source_target import Side + + +def test_graph_init() -> None: + g = graph.init("w1 w2") + source = [token.Token(text="w1 ", id="s0"), token.Token(text="w2 ", id="s1")] + target = [token.Token(text="w1 ", id="t0"), token.Token(text="w2 ", id="t1")] + edges = graph.edge_record([graph.edge(["s0", "t0"], []), graph.edge(["s1", "t1"], [])]) + + assert g.source == source + assert g.target == target + assert g.edges == edges + + +def test_graph_case1() -> None: + first = "Jonathan saknades , emedan han , med sin vapendragare , redan på annat håll sökt och anträffat fienden ." # noqa: E501 + second = "Jonat han saknades , emedan han , med sin vapendragare , redan på annat håll sökt och anträffat fienden ." # noqa: E501 + + g = graph.init(first) + + gm = graph.set_target(g, second) + print(f"{gm=}") + assert "e-s0-t19-t20" in gm.edges + + +def test_graph_case2() -> None: + first = "Jonat han saknades , emedan han , med sin vapendragare , redan på annat håll sökt och anträffat fienden ." # noqa: E501 + second = "Jonathan saknaes , emedan han , med sin vapendragare , redan på annat håll sökt och anträffat fienden ." 
# noqa: E501 + + g = graph.init(first) + + gm = graph.set_target(g, second) + print(f"{gm=}") + assert "e-s0-s1-t20" in gm.edges + + +def test_unaligned_set_side() -> None: + g0 = graph.init("a bc d") + print(">>> test_unaligned_set_side") + g = graph.unaligned_set_side(g0, Side.target, "ab c d") + print("<<< test_unaligned_set_side") + + expected_source = [ + token.Token(id="s0", text="a "), + token.Token(id="s1", text="bc "), + token.Token(id="s2", text="d "), + ] + expected_g0_target = [ + token.Token(id="t0", text="a "), + token.Token(id="t1", text="bc "), + token.Token(id="t2", text="d "), + ] + expected_g_target = [ + token.Token(id="t3", text="ab "), + token.Token(id="t4", text="c "), + token.Token(id="t5", text="d "), + ] + expected_g_edges = { + "e-s0-s1-s2-t3-t4-t5": graph.Edge( + id="e-s0-s1-s2-t3-t4-t5", + ids=["s0", "s1", "s2", "t3", "t4", "t5"], + labels=[], + manual=False, + ), + } + + assert g0.source == expected_source + assert g0.target == expected_g0_target + assert g.source == expected_source + assert g.target == expected_g_target + assert g.edges == expected_g_edges + + +def test_graph_align() -> None: + g0 = graph.init("a bc d") + + g = graph.unaligned_set_side(g0, Side.target, "ab c d") + + expected_source = [ + token.Token(id="s0", text="a "), + token.Token(id="s1", text="bc "), + token.Token(id="s2", text="d "), + ] + expected_g0_target = [ + token.Token(id="t0", text="a "), + token.Token(id="t1", text="bc "), + token.Token(id="t2", text="d "), + ] + expected_g_target = [ + token.Token(id="t3", text="ab "), + token.Token(id="t4", text="c "), + token.Token(id="t5", text="d "), + ] + expected_g_edges = { + "e-s0-s1-s2-t3-t4-t5": graph.Edge( + id="e-s0-s1-s2-t3-t4-t5", + ids=["s0", "s1", "s2", "t3", "t4", "t5"], + labels=[], + manual=False, + ), + } + expected_g_aligned_edges = { + "e-s0-s1-t3-t4": graph.Edge( + id="e-s0-s1-t3-t4", ids=["s0", "s1", "t3", "t4"], labels=[], manual=False + ), + "e-s2-t5": graph.Edge(id="e-s2-t5", ids=["s2", "t5"], labels=[], manual=False), + } + + assert g0.source == expected_source + assert g0.target == expected_g0_target + assert g.source == expected_source + assert g.target == expected_g_target + assert g.edges == expected_g_edges + g_aligned = graph.align(g) + assert g_aligned.source == expected_source + assert g_aligned.target == expected_g_target + assert g_aligned.edges == expected_g_aligned_edges + assert len(g_aligned.edges) == 2 + + +def show(g: graph.Graph) -> List[str]: + return [t.text for t in g.target] + + +def show_source(g: graph.Graph) -> List[str]: + return [s.text for s in g.source] + + +def ids(g: graph.Graph) -> str: + return " ".join((t.id for t in g.target)) + + +def ids_source(g: graph.Graph) -> str: + return " ".join((s.id for s in g.source)) + + +@pytest.mark.parametrize( + "i0, i1, word", + [ + (0, 0, "new"), + (0, 1, "new"), + (0, 5, "new "), + (0, 5, "new"), + (5, 5, " "), + (5, 6, " "), + (0, 15, "_"), + (0, 16, "_"), + (0, 17, "_"), + (16, 16, " !"), + ], +) +def test_unaligned_modify(i0: int, i1: int, word: str, snapshot): + g = graph.init("test graph hello") + assert g is not None + assert show(graph.unaligned_modify(g, i0, i1, word)) == snapshot + + +def test_edge_map() -> None: + g = graph.init("w") + e = graph.edge(["s0", "t0"], []) + print(f"{graph.edge_map(g)=}") + lhs = list(graph.edge_map(g).items()) + rhs = [("s0", e), ("t0", e)] + assert lhs == rhs + + +def test_unaligned_modify_tokens() -> None: + g = graph.init("test graph hello") + assert show(g) == ["test ", "graph ", "hello "] + assert 
ids(g) == "t0 t1 t2" + + +@pytest.mark.parametrize("text, expected", [("this", True), ("this ", False)]) +def test_no_whitespace_at_end(text: str, *, expected: bool) -> None: + assert (graph.NO_WHITESPACE_AT_END.match(text[-1:]) is not None) is expected + + +@pytest.mark.parametrize( + "from_, to, text", + [ + (0, 0, "this "), + (0, 1, "this "), + (0, 1, " white "), + (0, 1, "this"), + (1, 2, "graph"), + (1, 2, " graph "), + (0, 1, "for this "), + (0, 2, ""), + (0, 2, " "), + (1, 3, " "), + (3, 3, " !"), + ], +) +def test_unaligned_modify_tokens_show(from_: int, to: int, text: str, snapshot) -> None: + g = graph.init("test graph hello") + assert show(graph.unaligned_modify_tokens(g, from_, to, text)) == snapshot + + +@pytest.mark.parametrize( + "from_, to, text", + [ + (0, 0, "this "), + (0, 1, "this "), + (0, 1, "this"), + ], +) +def test_unaligned_modify_tokens_ids(from_: int, to: int, text: str, snapshot) -> None: + g = graph.init("test graph hello") + assert ids(graph.unaligned_modify_tokens(g, from_, to, text)) == snapshot + + +@pytest.mark.parametrize( + "from_, to, text", + [ + (0, 0, "this "), + ], +) +def test_unaligned_modify_tokens_show_source(from_: int, to: int, text: str, snapshot) -> None: + g = graph.init("test graph hello") + assert ( + show_source(graph.unaligned_modify_tokens(g, from_, to, text, Side.source)) == snapshot + ) + + +@pytest.mark.parametrize( + "from_, to, text", + [ + (0, 0, "this "), + ], +) +def test_unaligned_modify_tokens_ids_source(from_: int, to: int, text: str, snapshot) -> None: + g = graph.init("test graph hello") + assert ids_source(graph.unaligned_modify_tokens(g, from_, to, text, Side.source)) == snapshot + + +# show(unaligned_modify_tokens(init('a '), 0, 1, ' ')) // => [' '] +# ids(g) // => 't0 t1 t2' +# ids(unaligned_modify_tokens(g, 0, 0, 'this ')) // => 't3 t0 t1 t2' +# ids(unaligned_modify_tokens(g, 0, 1, 'this ')) // => 't3 t1 t2' +# ids(unaligned_modify_tokens(g, 0, 1, 'this')) // => 't3 t2' +# const showS = (g: Graph) => g.source.map(t => t.text) +# const idsS = (g: Graph) => g.source.map(t => t.id).join(' ') +# showS(unaligned_modify_tokens(g, 0, 0, 'this ', 'source')) // => ['this ', 'test ', 'graph ', 'hello '] # noqa: E501 +# idsS(unaligned_modify_tokens(g, 0, 0, 'this ', 'source')) // => 's3 s0 s1 s2' + + +def test_unaligned_rearrange() -> None: + g = graph.init("apa bepa cepa depa") + gr = graph.unaligned_rearrange(g, 1, 2, 0) + assert graph.target_text(gr) == "bepa cepa apa depa " # type: ignore [arg-type] + + +# target_text(unaligned_rearrange(init(), 1, 2, 0)) // => diff --git a/tests/test_shared/__snapshots__/test_ranges.ambr b/tests/test_shared/__snapshots__/test_ranges.ambr new file mode 100644 index 0000000..55f2833 --- /dev/null +++ b/tests/test_shared/__snapshots__/test_ranges.ambr @@ -0,0 +1,63 @@ +# name: test_edit_range[-01] + dict({ + 'from': 0, + 'insert': '01', + 'to': 0, + }) +# --- +# name: test_edit_range[0123456789-0189] + dict({ + 'from': 2, + 'insert': '', + 'to': 8, + }) +# --- +# name: test_edit_range[0123456789-01] + dict({ + 'from': 2, + 'insert': '', + 'to': 10, + }) +# --- +# name: test_edit_range[0123456789-01xyz89] + dict({ + 'from': 2, + 'insert': 'xyz', + 'to': 8, + }) +# --- +# name: test_edit_range[0123456789-01xyz] + dict({ + 'from': 2, + 'insert': 'xyz', + 'to': 10, + }) +# --- +# name: test_edit_range[0123456789-89] + dict({ + 'from': 0, + 'insert': '', + 'to': 8, + }) +# --- +# name: test_edit_range[0123456789-] + dict({ + 'from': 0, + 'insert': '', + 'to': 10, + }) +# --- +# name: 
test_edit_range[0123456789-xyz89] + dict({ + 'from': 0, + 'insert': 'xyz', + 'to': 8, + }) +# --- +# name: test_edit_range[0123456789-xyz] + dict({ + 'from': 0, + 'insert': 'xyz', + 'to': 10, + }) +# --- diff --git a/tests/test_shared/test_diffs.py b/tests/test_shared/test_diffs.py new file mode 100644 index 0000000..754b8f0 --- /dev/null +++ b/tests/test_shared/test_diffs.py @@ -0,0 +1,17 @@ +from parallel_corpus.shared.diffs import Change, hdiff + + +def test_hdiff() -> None: + (*abcca,) = "abcca" # type: ignore + (*BACC,) = "BACC" # type: ignore + + expected = [ + Change.deleted("a"), + Change.constant("b", "B"), + Change.inserted("A"), + Change.constant("c", "C"), + Change.constant("c", "C"), + Change.deleted("a"), + ] + + assert hdiff(abcca, BACC, str.lower, str.lower) == expected # type: ignore [has-type] diff --git a/tests/test_shared/test_functional.py b/tests/test_shared/test_functional.py new file mode 100644 index 0000000..149f850 --- /dev/null +++ b/tests/test_shared/test_functional.py @@ -0,0 +1,15 @@ +from parallel_corpus.shared import functional + + +def test_take_last_while_list() -> None: + source = [1, 2, 3, 4] + assert functional.take_last_while(is_not_none, source) == [2, 3, 4] + assert source == [1, 2, 3, 4] + + +def test_take_last_while_str() -> None: + assert functional.take_last_while(lambda x: x != "R", "Ramda") == "amda" + + +def is_not_none(x: int) -> bool: + return x != 1 diff --git a/tests/test_shared/test_ids.py b/tests/test_shared/test_ids.py new file mode 100644 index 0000000..152683e --- /dev/null +++ b/tests/test_shared/test_ids.py @@ -0,0 +1,8 @@ +from parallel_corpus.shared.ids import next_id + + +def test_next_id(): + assert next_id([]) == 0 + assert next_id(["t1", "t2", "t3"]) == 4 + assert next_id(["u2v5k1", "b3", "a0"]) == 6 + assert next_id(["77j66"]) == 78 diff --git a/tests/test_shared/test_lists.py b/tests/test_shared/test_lists.py new file mode 100644 index 0000000..010742a --- /dev/null +++ b/tests/test_shared/test_lists.py @@ -0,0 +1,15 @@ +from parallel_corpus.shared import lists + + +def test_splice_1(): + (*s_chars,) = "abcdef" + ex, rm = lists.splice(s_chars, 3, 1, " ", "_") + assert "".join(ex) == "abc _ef" + assert "".join(rm) == "d" + + +def test_splice_2(): + (*s_chars,) = "abcdef" + (ex, rm) = lists.splice(s_chars, 3, 2, " ", "_") + assert "".join(ex) == "abc _f" + assert "".join(rm) == "de" diff --git a/tests/test_shared/test_ranges.py b/tests/test_shared/test_ranges.py new file mode 100644 index 0000000..ae16bbe --- /dev/null +++ b/tests/test_shared/test_ranges.py @@ -0,0 +1,20 @@ +import pytest +from parallel_corpus.shared.ranges import edit_range + + +@pytest.mark.parametrize( + "s0, s", + [ + ("0123456789", "0189"), + ("0123456789", "01"), + ("0123456789", "89"), + ("0123456789", ""), + ("0123456789", "01xyz89"), + ("0123456789", "01xyz"), + ("0123456789", "xyz89"), + ("0123456789", "xyz"), + ("", "01"), + ], +) +def test_edit_range(s0: str, s: str, snapshot): + assert edit_range(s0, s) == snapshot diff --git a/tests/test_shared/test_union_find.py b/tests/test_shared/test_union_find.py new file mode 100644 index 0000000..696eb44 --- /dev/null +++ b/tests/test_shared/test_union_find.py @@ -0,0 +1,42 @@ +from parallel_corpus.shared.union_find import UnionFind, poly_union_find, renumber + + +def test_union_find() -> None: + uf = UnionFind() + assert uf.find(10) != uf.find(20) + uf.union(10, 20) + assert uf.find(10) == uf.find(20) + uf.union(20, 30) + assert uf.find(10) == uf.find(30) + uf.unions([10, 40, 50]) + assert uf.find(20) == 
uf.find(40) + assert uf.find(20) == uf.find(50) + + +def test_renumber_default() -> None: + un, num = renumber() # type: ignore [var-annotated] + assert num("foo") == 0 + assert num("bar") == 1 + assert num("foo") == 0 + assert un(0) == "foo" + assert un(1) == "bar" + assert un(2) is None + + +def test_renumber_lowercase() -> None: + un, num = renumber(str.lower) # type: ignore [var-annotated] + + assert num("foo") == 0 + assert num("FOO") == 0 + assert un(0) == "foo" + + +def test_poly_union_find() -> None: + uf = poly_union_find(str.lower) + assert uf.repr("a") == 0 + assert uf.repr("A") == 0 + assert uf.find("a") == "a" + assert uf.find("A") == "a" + assert uf.find("a") != uf.find("b") + assert uf.union("A", "B") + assert uf.find("a") == uf.find("b") diff --git a/tests/test_token.py b/tests/test_token.py new file mode 100644 index 0000000..0c16f5f --- /dev/null +++ b/tests/test_token.py @@ -0,0 +1,36 @@ +from typing import List + +import pytest +from parallel_corpus.token import Token, identify, tokenize + + +def test_can_create_token() -> None: + token = Token(text="a text", id="s0") + + assert token.id == "s0" + assert token.text == "a text" + + +@pytest.mark.parametrize( + "text, expected", + [ + ("", []), + (" ", [" "]), + (" ", [" "]), + ("apa bepa cepa", ["apa ", "bepa ", "cepa "]), + (" apa bepa cepa", [" apa ", "bepa ", "cepa "]), + (" apa bepa cepa ", [" apa ", "bepa ", "cepa "]), + ], +) +def test_tokenize(text: str, expected: List[str], snapshot) -> None: + actual = tokenize(text) + + assert actual == expected + assert actual == snapshot + + +def test_identify() -> None: + assert identify(["apa", "bepa"], "#") == [ + Token(text="apa", id="#0"), + Token(text="bepa", id="#1"), + ]
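
Taken together, the pieces above compose as follows. A minimal usage sketch (hypothetical, assuming the diff above is applied and the package is importable as `parallel_corpus`): `graph.init` builds an identity-aligned graph, `graph.set_target` rewrites the target side, and the character-level diff in `align` recomputes which source and target tokens belong to the same edge.

```python
from parallel_corpus import graph

# Source and target start out identical and fully aligned.
g = graph.init("w1 w2")

# Rewrite the target side; set_target re-tokenizes and re-aligns it.
g2 = graph.set_target(g, "w1 w3")

print(graph.target_text(g2))  # 'w1 w3 ' (tokens keep their trailing whitespace)
print(sorted(g2.edges))       # e.g. ['e-s0-t0', 'e-s1-t2'], ids derived from aligned token ids
```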